Opened 14 years ago

Last modified 9 years ago

#274 new defect

IZ1791: accounting exec node is incorrectly filled in with inf

Reported by: sgrell Owned by:
Priority: low Milestone:
Component: sge Version: 6.0
Severity: Keywords: execution
Cc:

Description

[Imported from gridengine issuezilla http://gridengine.sunsource.net/issues/show_bug.cgi?id=1791]

        Issue #:      1791             Platform:     All      Reporter: sgrell (sgrell)
       Component:     gridengine          OS:        All
     Subcomponent:    execution        Version:      6.0         CC:    None defined
        Status:       NEW              Priority:     P4
      Resolution:                     Issue type:    DEFECT
                                   Target milestone: ---
      Assigned to:    sgrell (sgrell)
      QA Contact:     pollinger
          URL:
       * Summary:     accounting exec node is incorrectly filled in with inf
   Status whiteboard:
      Attachments:

     Issue 1791 blocks:
   Votes for issue 1791:


   Opened: Tue Sep 13 01:33:00 -0700 2005 
------------------------


The reporting file of the qmaster contains Inf and NaN values for sharelog
reporting values.

Kris Buggenhout reported this error. He gave me the share tree configuration of
the affected installation:

root@babbage # qconf -sstree
id=0
name=Root
type=0
shares=1
childnodes=1,13,14,40,46,80
id=1
name=biomedische
type=0
shares=24
childnodes=2,6
id=2
name=cvbroeck-g
type=0
shares=12
childnodes=3,4,5
id=3
name=bharding
type=0
shares=0
childnodes=NONE
id=4
name=derijkp
type=0
shares=0
childnodes=NONE
id=5
name=glassee
type=0
shares=0
childnodes=NONE
id=6
name=erikds-g
type=0
shares=12
childnodes=7,8,9,10,11,12
id=7
name=erikds
type=0
shares=0
childnodes=NONE
id=8
name=mberends
type=0
shares=0
childnodes=NONE
id=9
name=pachar
type=0
shares=0
childnodes=NONE
id=10
name=rmaex
type=0
shares=0
childnodes=NONE
id=11
name=wils
type=0
shares=0
childnodes=NONE
id=12
name=wvangeit
type=0
shares=0
childnodes=NONE
id=13
name=extern
type=1
shares=10
childnodes=NONE
id=14
name=fysica
type=0
shares=14
childnodes=15,21
id=15
name=lamoen-g
type=0
shares=0
childnodes=16,17,18,19,20
id=16
name=jorissen
type=0
shares=0
childnodes=NONE
id=17
name=jtitanta
type=0
shares=0
childnodes=NONE
id=18
name=lamoen
type=0
shares=0
childnodes=NONE
id=19
name=mascho
type=0
shares=0
childnodes=NONE
id=20
name=socu
type=0
shares=0
childnodes=NONE
id=21
name=peeters-g
type=0
shares=0
childnodes=22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
id=22
name=aslachmu
type=0
shares=0
childnodes=NONE
id=23
name=azamat
type=0
shares=0
childnodes=NONE
id=24
name=bbaelus
type=0
shares=0
childnodes=NONE
id=25
name=bpartoen
type=0
shares=0
childnodes=NONE
id=26
name=golib
type=0
shares=0
childnodes=NONE
id=27
name=kwinten
type=0
shares=0
childnodes=NONE
id=28
name=maarten
type=0
shares=0
childnodes=NONE
id=29
name=milo
type=0
shares=0
childnodes=NONE
id=30
name=mtadic
type=0
shares=0
childnodes=NONE
id=31
name=peeters
type=0
shares=0
childnodes=NONE
id=32
name=piacente
type=0
shares=0
childnodes=NONE
id=33
name=predrag
type=0
shares=0
childnodes=NONE
id=34
name=riva
type=0
shares=0
childnodes=NONE
id=35
name=s0020389
type=0
shares=0
childnodes=NONE
id=36
name=sandor
type=0
shares=0
childnodes=NONE
id=37
name=sergio
type=0
shares=0
childnodes=NONE
id=38
name=szafran
type=0
shares=0
childnodes=NONE
id=39
name=vladan
type=0
shares=0
childnodes=NONE
id=40
name=letteren
type=0
shares=12
childnodes=41,42,43,44,45
id=41
name=daelem
type=0
shares=0
childnodes=NONE
id=42
name=decadt
type=0
shares=0
childnodes=NONE
id=43
name=depauw
type=0
shares=0
childnodes=NONE
id=44
name=hoste
type=0
shares=0
childnodes=NONE
id=45
name=jmeyhi
type=0
shares=0
childnodes=NONE
id=46
name=scheikunde
type=0
shares=21
childnodes=47,60,70
id=47
name=abogaert-g
type=0
shares=0
childnodes=48,49,50,51,52,53,54,55,56,57,58,59
id=48
name=abogaert
type=0
shares=0
childnodes=NONE
id=49
name=bleecker
type=0
shares=0
childnodes=NONE
id=50
name=chen
type=0
shares=0
childnodes=NONE
id=51
name=kolev
type=0
shares=0
childnodes=NONE
id=52
name=madani
type=0
shares=0
childnodes=NONE
id=53
name=neyts
type=0
shares=0
childnodes=NONE
id=54
name=s0015103
type=0
shares=0
childnodes=NONE
id=55
name=s9965997
type=0
shares=0
childnodes=NONE
id=56
name=s9995504
type=0
shares=0
childnodes=NONE
id=57
name=vantdack
type=0
shares=0
childnodes=NONE
id=58
name=vgeorg
type=0
shares=0
childnodes=NONE
id=59
name=yanhong
type=0
shares=0
childnodes=NONE
id=60
name=alsenoy-g
type=0
shares=0
childnodes=61,62,63,64,65,66,67,68,69
id=61
name=akrishta
type=0
shares=0
childnodes=NONE
id=62
name=alsenoy
type=0
shares=0
childnodes=NONE
id=63
name=baeke
type=0
shares=0
childnodes=NONE
id=64
name=cwuyts
type=0
shares=0
childnodes=NONE
id=65
name=fblockhu
type=0
shares=0
childnodes=NONE
id=66
name=howard
type=0
shares=0
childnodes=NONE
id=67
name=ktersago
type=0
shares=0
childnodes=NONE
id=68
name=nivesano
type=0
shares=0
childnodes=NONE
id=69
name=s0040623
type=0
shares=0
childnodes=NONE
id=70
name=woher-g
type=0
shares=0
childnodes=71,72,73,74,75,76,77,78,79
id=71
name=bvdveken
type=0
shares=0
childnodes=NONE
id=72
name=edebie
type=0
shares=0
childnodes=NONE
id=73
name=jojans
type=0
shares=0
childnodes=NONE
id=74
name=petervg
type=0
shares=0
childnodes=NONE
id=75
name=s0025089
type=0
shares=0
childnodes=NONE
id=76
name=s0025357
type=0
shares=0
childnodes=NONE
id=77
name=tkupp
type=0
shares=0
childnodes=NONE
id=78
name=wimv
type=0
shares=0
childnodes=NONE
id=79
name=woher
type=0
shares=0
childnodes=NONE
id=80
name=wis-inf
type=0
shares=12
childnodes=81,86,94
id=81
name=bnaudts-g
type=0
shares=12
childnodes=82,83,84,85
id=82
name=bnaudts
type=0
shares=0
childnodes=NONE
id=83
name=efernan
type=0
shares=0
childnodes=NONE
id=84
name=kovle
type=0
shares=0
childnodes=NONE
id=85
name=pvremort
type=0
shares=0
childnodes=NONE
id=86
name=broeckho-g
type=0
shares=6
childnodes=87,88,89,90,91,92,93
id=87
name=arickx
type=0
shares=0
childnodes=NONE
id=88
name=broeckho
type=0
shares=0
childnodes=NONE
id=89
name=dewolfs
type=0
shares=0
childnodes=NONE
id=90
name=kuvme
type=0
shares=0
childnodes=NONE
id=91
name=pehe
type=0
shares=0
childnodes=NONE
id=92
name=tdhaene
type=0
shares=0
childnodes=NONE
id=93
name=wohe
type=0
shares=0
childnodes=NONE
id=94
name=cuyt-g
type=0
shares=6
childnodes=95,96,97,98,99,100,101,102,103
id=95
name=cuyt
type=0
shares=0
childnodes=NONE
id=96
name=ghen
type=0
shares=0
childnodes=NONE
id=97
name=lenin
type=0
shares=0
childnodes=NONE
id=98
name=nvdbergh
type=0
shares=0
childnodes=NONE
id=99
name=s0025171
type=0
shares=0
childnodes=NONE
id=100
name=sbecuwe
type=0
shares=0
childnodes=NONE
id=101
name=tdocx
type=0
shares=0
childnodes=NONE
id=102
name=verdonk
type=0
shares=0
childnodes=NONE
id=103
name=wlee
type=0
shares=0
*** (#1 of 2): 2005-08-09 10:21:05 CEST richard.hierlmeier@sun.com

Reaction of the new dbwriter on wrong entries...

09/08/2005 09:06:28|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_actual_share got 'NaN' double value, assume 0
09/08/2005 09:06:28|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_usage got 'Inf' double value, assume 0
09/08/2005 09:06:29|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_actual_share got 'NaN' double value, assume 0
09/08/2005 09:06:29|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_usage got 'Inf' double value, assume 0
09/08/2005 09:06:29|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_cpu got 'Inf' double value, assume 0
09/08/2005 09:06:29|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_ltcpu got 'Inf' double value, assume 0
09/08/2005 09:06:29|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_actual_share got 'NaN' double value, assume 0
09/08/2005 09:06:29|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_usage got 'Inf' double value, assume 0
09/08/2005 09:06:29|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_actual_share got 'NaN' double value, assume 0
09/08/2005 09:06:29|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_usage got 'Inf' double value, assume 0
09/08/2005 09:06:29|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_cpu got 'Inf' double value, assume 0
09/08/2005 09:06:29|babbage.cmi.ua.ac.be|r.db.DatabaseField.parseDouble|E|field
sl_ltcpu got 'Inf' double value, assume 0


relevant entries in the reporting.processing file...

1123535707:sharelog:1123535707:0:Root:::1:166:1.000000:1.000000:1.000000:0.000000:NaN:Inf:0.000000:0.000000:0.000000:0.000000:0.000000:0.000000:
1123535707:sharelog:1123535707:0:fysica::fysica:14:128:0.150538:0.150538:0.400000:0.000000:NaN:Inf:Inf:136096287516.860565:0.000000:Inf:223562980726.100525:0.000000:
1123535707:sharelog:1123535707:0:lamoen-g::fysica:0:0:0.000000:0.000000:0.000000:0.000000:NaN:Inf:0.000000:0.000000:0.000000:0.000000:0.000000:0.000000:
1123535707:sharelog:1123535707:1123535704:socu:socu:fysica:0:0:0.000000:0.000000:0.000000:0.000000:NaN:Inf:Inf:135955501346.559906:0.000000:Inf:223370966908.878235:0.000000:
1123539307:sharelog:1123539307:0:Root:::1:165:1.000000:1.000000:1.000000:0.000000:NaN:Inf:0.000000:0.000000:0.000000:0.000000:0.000000:0.000000:
1123539307:sharelog:1123539307:0:fysica::fysica:14:127:0.150538:0.150538:0.400000:0.000000:NaN:Inf:Inf:136035313303.346176:0.000000:Inf:223563391787.374237:0.000000:
1123539307:sharelog:1123539307:0:lamoen-g::fysica:0:0:0.000000:0.000000:0.000000:0.000000:NaN:Inf:0.000000:0.000000:0.000000:0.000000:0.000000:0.000000:
1123539307:sharelog:1123539307:1123539303:socu:socu:fysica:0:0:0.000000:0.000000:0.000000:0.000000:NaN:Inf:Inf:135894179665.903000:0.000000:Inf:223370966908.878235:0.000000:
1123542907:sharelog:1123542907:0:Root:::1:163:1.000000:1.000000:1.000000:0.000000:NaN:Inf:0.000000:0.000000:0.000000:0.000000:0.000000:0.000000:
1123542907:sharelog:1123542907:0:fysica::fysica:14:125:0.150538:0.150538:0.400000:0.000000:NaN:Inf:Inf:135974321481.000214:0.000000:Inf:223563774772.040771:0.000000:
1123542907:sharelog:1123542907:0:lamoen-g::fysica:0:0:0.000000:0.000000:0.000000:0.000000:NaN:Inf:0.000000:0.000000:0.000000:0.000000:0.000000:0.000000:
1123542907:sharelog:1123542907:1123542903:socu:socu:fysica:0:0:0.000000:0.000000:0.000000:0.000000:NaN:Inf:Inf:135832868616.912125:0.000000:Inf:223370966908.878235:0.000000:
*** (#2 of 2): 2005-08-09 11:09:58 CEST richard.hierlmeier@sun.com

   ------- Additional comments from sgrell Tue Nov 29 09:00:24 -0700 2005 -------
This can happen to the sharetree, when the usage grows to infinity.

What would be a good work around?

In theory, it should not happen, due to the decay, but what if someone
turns the decay off?

Cheers,
Stephan

   ------- Additional comments from svdavidson Tue Nov 29 15:40:48 -0700 2005 -------
It seems to me that the Inf value is probably caused by a bug or a strange
configuration value. If the values are being stored in the DB as double
precision floating point values, what other than a bug would produce values of
infinity? 100,000 CPUs used for 10 years is 31,536,000,000,000 CPU seconds which
is easily represented as a double precision floating point value (but which is
pushing the 53 bits of precision on an x86 double - you can add a second of CPU
usage to it, but not a millisecond). I guess this could happen if someone used a
ridiculous scaling factor. The best thing might be to add code to recognize this
situation and log an error message, so that we can find the problem. If you want
to avoid crashing the scheduler, you could also consider capping the maximum to
some huge value.

   ------- Additional comments from sgrell Wed Nov 30 02:13:33 -0700 2005 -------
I agree that it is unlikely to hit the double limit. In this case, we do not
know anything about the cluster. Adding a warning when this happens is an easy
thing to do, and it will not destroy anything.
I went over the source code and had the provided sharetree running for some time
and both seemed to be correct. I could not reproduce the described issue and did
not find a bug in the source.

Stephan

   ------- Additional comments from sgrell Mon Dec 12 02:24:54 -0700 2005 -------
In some cases the accounting of a job is filled in with the value inf, which
causes the accounting to be unreliable.

==============================================================
qname        normalprio
hostname     node31r05e0
group        lamoen
owner        socu
project      fysica
department   lamoen
jobname      dofeff1J
jobnumber    63740
taskid       undefined
account      sge
priority     0
qsub_time    Thu Jun 23 14:54:30 2005
start_time   Thu Jun 23 10:31:51 2005
end_time     Thu Jun 23 15:01:31 2005
granted_pe   NONE
slots        1
failed       100 : assumedly after job
exit_status  137
ru_wallclock 16180
ru_utime     0
ru_stime     0
ru_maxrss    0
ru_ixrss     0
ru_ismrss    0
ru_idrss     0
ru_isrss     0
ru_minflt    4517
ru_majflt    7129
ru_nswap     0
ru_inblock   0
ru_oublock   0
ru_msgsnd    0
ru_msgrcv    0
ru_nsignals  0
ru_nvcsw     0
ru_nivcsw    0
cpu          Inf
mem          14425.581
io           0.000
iow          0.000
maxvmem      927.117M
==============================================================
qname        normalprio
hostname     node31r05e0
group        lamoen
owner        socu
project      fysica
department   lamoen
jobname      dofeff1J
jobnumber    63739
taskid       undefined
account      sge
priority     0
qsub_time    Thu Jun 23 14:54:21 2005
start_time   Thu Jun 23 10:31:40 2005
end_time     Fri Jun 24 05:15:28 2005
granted_pe   NONE
slots        1
failed       100 : assumedly after job
exit_status  137
ru_wallclock 67428
ru_utime     0
ru_stime     0
ru_maxrss    0
ru_ixrss     0
ru_ismrss    0
ru_idrss     0
ru_isrss     0
ru_minflt    4517
ru_majflt    7129
ru_nswap     0
ru_inblock   0
ru_oublock   0
ru_msgsnd    0
ru_msgrcv    0
ru_nsignals  0
ru_nvcsw     0
ru_nivcsw    0
cpu          Inf
mem          223338313898.996
io           0.000
iow          0.000
maxvmem      927.117M
HOST        OWNER     WALLCLOCK         UTIME         STIME           CPU
      MEMORY                 IO                IOW
==================================================================================================================================
node31r05e0 socu         178753         95070            50           Inf
223338413562.176              0.000              0.000

from exec node:
06/23/2005 14:01:31|execd|node31r05e0|W|job 63740 exceeds job hard limit "h_cpu"
of queue "normalprio@node31r05e0" (     inf >
limit:1797693134862315708145274237317043567980705675258449965989174768031572607800285387605895586327668781715404589535143824642343213268894641827684675467035375169860499105765512820762454900903893289440758685084551339423045832369032229481658085593321233482
74797826204144723168738177180919299881250404026184124858368.00000)
- sending SIGKILL
06/24/2005 04:15:28|execd|node31r05e0|W|job 63739 exceeds job hard limit "h_cpu"
of queue "normalprio@node31r05e0" (     inf >
limit:1797693134862315708145274237317043567980705675258449965989174768031572607800285387605895586327668781715404589535143824642343213268894641827684675467035375169860499105765512820762454900903893289440758685084551339423045832369032229481658085593321233482
74797826204144723168738177180919299881250404026184124858368.00000)
- sending SIGKILL



Sadly, I have no more information on the jobs than this.
kris.buggenhout@sun.com 2005-07-08 14:38:29 GMT
*** (#1 of 1): 2005-07-08 16:38:29 CEST kris.buggenhout@sun.com

Change History (0)

Note: See TracTickets for help on using tickets.