Ticket #1550: 0005-Enhmt-1550-sge_sort_job_nodes-sorts-both-running-and.patch

File 0005-Enhmt-1550-sge_sort_job_nodes-sorts-both-running-and.patch, 7.6 KB (added by markdixon, 4 years ago)

5/6

  • source/libs/sched/sgeee.c

    From 4a07ceaadc3a41794cec5beba8d904704e84bcfd Mon Sep 17 00:00:00 2001
    From: Mark Dixon <m.c.dixon@leeds.ac.uk>
    Date: Thu, 13 Aug 2015 09:43:04 +0100
    Subject: [PATCH 5/6] Enhmt #1550 sge_sort_job_nodes sorts both running and pending jobs
    
    Does not change the scheduler's result, but scheduling runs will take
    more effort, as more of the sharetree is processed (previously, nodes
    without pending jobs were ignored).
    ---
     source/libs/sched/sgeee.c |   81 +++++++++++++++++++++------------------------
     1 files changed, 38 insertions(+), 43 deletions(-)
    
    diff --git a/source/libs/sched/sgeee.c b/source/libs/sched/sgeee.c
    index f692305..9f15288 100644
    a b sge_calc_tickets( scheduler_all_data_t *lists, 
    30023002         for(job_ndx=0; job_ndx<num_jobs; job_ndx++) {
    30033003            sge_ref_t *jref = &job_ref[job_ndx];
    30043004            lListElem *node = jref->node;
    3005             if (jref->queued) {
    3006                if (node) {
    3007                   /* add each job to the share tree as a temporary sibling node */
    3008                   char tmpstr[64];
    3009                   lListElem *child;
    3010 
    3011                   job = jref->job;
    3012                   sprintf(tmpstr, sge_u32"."sge_u32, lGetUlong(job, JB_job_number),
    3013                          REF_GET_JA_TASK_NUMBER(jref));
    3014                   child = lAddSubStr(node, STN_name, tmpstr, STN_children, STN_Type);
    3015                   lSetUlong(child, STN_jobid, lGetUlong(job, JB_job_number));
    3016                   lSetUlong(child, STN_taskid, REF_GET_JA_TASK_NUMBER(jref));
    3017                   lSetUlong(child, STN_temp, 1);
    3018                   /* save the job reference, so we can refer to it later to set job fields */
    3019                   lSetUlong(child, STN_ref, job_ndx+1);
    3020                   if (hierarchy[policy_ndx].dependent) {
    3021                      /* set the sort value based on tickets of higher level policy */
    3022                      lSetDouble(child, STN_tickets, jref->tickets);
    3023                      lSetDouble(child, STN_sort,
    3024                                 jref->tickets + (0.01 * (double)lGetUlong(job, JB_jobshare)));
    3025                   } else
    3026                      /* set the sort value based on the priority of the job */
    3027                      lSetDouble(child, STN_sort, (double)lGetUlong(job, JB_jobshare));
    3028                }
     3005            if (node) {
     3006               /* add each job to the share tree as a temporary sibling node */
     3007               char tmpstr[64];
     3008               lListElem *child;
     3009
     3010               job = jref->job;
     3011               sprintf(tmpstr, sge_u32"."sge_u32, lGetUlong(job, JB_job_number),
     3012                      REF_GET_JA_TASK_NUMBER(jref));
     3013               child = lAddSubStr(node, STN_name, tmpstr, STN_children, STN_Type);
     3014               lSetUlong(child, STN_jobid, lGetUlong(job, JB_job_number));
     3015               lSetUlong(child, STN_taskid, REF_GET_JA_TASK_NUMBER(jref));
     3016               lSetUlong(child, STN_queued, jref->queued);
     3017               lSetUlong(child, STN_temp, 1);
     3018               /* save the job reference, so we can refer to it later to set job fields */
     3019               lSetUlong(child, STN_ref, job_ndx+1);
     3020               if (hierarchy[policy_ndx].dependent) {
     3021                  /* set the sort value based on tickets of higher level policy */
     3022                  lSetDouble(child, STN_tickets, jref->tickets);
     3023                  lSetDouble(child, STN_sort,
     3024                             jref->tickets + (0.01 * (double)lGetUlong(job, JB_jobshare)));
     3025               } else
     3026                  /* set the sort value based on the priority of the job */
     3027                  lSetDouble(child, STN_sort, (double)lGetUlong(job, JB_jobshare));
    30293028            }
    30303029         }
    30313030
    sge_calc_tickets( scheduler_all_data_t *lists, 
    30383037             */
    30393038            for_each(job_node, sorted_job_node_list) {
    30403039               sge_ref_t *jref = &job_ref[lGetUlong(job_node, STN_ref)-1];
    3041                REF_SET_STICKET(jref,
     3040               if (jref->queued) {
     3041                  REF_SET_STICKET(jref,
    30423042                     lGetDouble(job_node, STN_shr) * total_share_tree_tickets);
    3043                if (hierarchy[policy_ndx].dependent)
    3044                   jref->tickets += REF_GET_STICKET(jref);
     3043                  if (hierarchy[policy_ndx].dependent)
     3044                     jref->tickets += REF_GET_STICKET(jref);
     3045               }
    30453046            }
    30463047            lFreeList(&sorted_job_node_list);
    30473048         }
    sge_sort_job_nodes(lListElem *root, 
    34073408                   sge_ref_t *job_ref)
    34083409{
    34093410   /* Prune sparsely populated sharetree */
    3410    /* Dispose of uppermost levels without pending jobs */
     3411   /* Dispose of uppermost levels without jobs */
    34113412   if(root == node){
    34123413      int active_nodes = 0;
    34133414      lListElem *child;
    sge_sort_job_nodes(lListElem *root, 
    34173418         if (lGetUlong(child, STN_ref)) {
    34183419            active_nodes++;
    34193420            break;
    3420          } else if ((lGetUlong(child, STN_job_ref_count)-lGetUlong(child, STN_active_job_ref_count))>0) {
     3421         } else if (lGetUlong(child, STN_job_ref_count)) {
    34213422            temp_root = child;
    34223423            active_nodes++;
    34233424         }
    sge_sort_job_nodes(lListElem *root, 
    34273428
    34283429      /*
    34293430       * Prune this node if:-
    3430        * - There are no pending jobs at higher levels, or in sibling nodes
    3431        * - It has no pending jobs
    3432        * - It has only one child object that contains pending jobs
     3431       * - There are no jobs at higher levels, or in sibling nodes
     3432       * - It has no jobs
     3433       * - It has only one child object that contains jobs
    34333434       */
    34343435      if (active_nodes == 1 && temp_root)
    34353436         return sge_sort_job_nodes(temp_root, temp_root, total_share_tree_tickets, job_ref);
    sge_sort_job_nodes(lListElem *root, 
    34473448            job_node_list = lCreateList("sorted job node list", STN_Type);
    34483449         lAppendElem(job_node_list, lCopyElem(child));
    34493450         job_nodes++;
    3450       } else if ((lGetUlong(child, STN_job_ref_count)-lGetUlong(child, STN_active_job_ref_count))>0) {
     3451      } else if (lGetUlong(child, STN_job_ref_count)) {
    34513452         lList *child_job_node_list;
    34523453         /* recursively get all the child job nodes onto our list */
    34533454         if ((child_job_node_list = sge_sort_job_nodes(root, child, total_share_tree_tickets, job_ref))) {
    sge_sort_job_nodes(lListElem *root, 
    34633464   if (job_nodes)
    34643465      lSetList(node, STN_children, NULL);
    34653466
    3466    /* sort the job nodes based on the calculated pending priority */
     3467   /* sort the job nodes based on the calculated priority */
    34673468   if (root != node || job_nodes) {
    34683469      lListElem *u;
    34693470
    34703471      if (job_node_list && lGetNumberOfElem(job_node_list)>1)
    3471          lPSortList(job_node_list, "%I- %I+ %I+", STN_sort, STN_jobid, STN_taskid);
     3472         lPSortList(job_node_list, "%I+ %I- %I+ %I+", STN_queued, STN_sort, STN_jobid, STN_taskid);
    34723473
    3473       /* calculate a new pending priority -
    3474          The priority of each pending job associated with this node is the
     3474      /* calculate a new priority -
     3475         The priority of each job associated with this node is the
    34753476         node's short term entitlement (STN_stt) divided by the number of jobs
    34763477         which are scheduled ahead of this node times the number of share tree
    34773478         tickets. If we are dependent on another higher-level policy, we also add
    34783479         the tickets from those policies. */
    34793480
    3480       /* - get running job count */
    3481       job_count = lGetUlong(node, STN_active_job_ref_count);
    3482       if ((u=lGetElemStr(lGetList(node, STN_usage_list), UA_name,
    3483                          "finished_jobs")))
    3484          job_count += lGetDouble(u, UA_value);
    3485 
    34863481      /* - get entitlement */
    34873482      double node_stt = lGetDouble(node, STN_stt);
    34883483