[GE users] threaded jobs (no PE) and consumable memory

txema_heredia txema.heredia at upf.edu
Wed Jan 13 17:31:02 GMT 2010


Hi all,

I've found a problem in SGE 6.1u4 regarding threaded jobs (without using PE) when submitted to a host with an h_vmem request:

I want to run "blastall -a 8"  in my cluster (the -a allows the process to use N threads to run its analysis, but it doesn't require a parallel environment, it uses libpthread.so.0).

I can run it without any problem from the command line of my front-end, and any of my execution hosts, but if I qsub it, it finishes unexpectedly and creates a core dump file.

I have been doing some research, and I have found that the problem lies in the fact that the program tries to do a malloc for all the memory given by -l h_vmem, and thus killing the process.

Here is some data:


blastall -a 1
without -l h_vmem
works OK

stat("/.../env_nt.02.nsi", {st_mode=S_IFREG|0644, st_size=5555499, ...}) = 0
mmap(NULL, 528384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab757e000
mmap(NULL, 135168, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab75ff000
munmap(0x2aaab757e000, 528384) = 0
mmap(NULL, 708608, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab7620000
stat("/.../env_nt.00.nsq", {st_mode=S_IFREG|0644, st_size=780519865, ...}) = 0


--------------------------------------------------------------------

blastall -a 8
without -l h_vmem
works OK

stat("/.../env_nt.02.nsi", {st_mode=S_IFREG|0644, st_size=5555499, ...}) = 0
mmap(NULL, 528384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab757e000
mmap(NULL, 135168, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab75ff000
munmap(0x2aaab757e000, 528384)    = 0
mmap(NULL, 2101248, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = 0x40000000
--- child creation ---
mprotect(0x40000000, 4096, PROT_NONE) = 0
clone(child_stack=0x40200250, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x402009d0, tls=0x40200940, child_tidptr=0x402009d0) = 1757
mmap(NULL, 2101248, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = 0x40201000
mprotect(0x40201000, 4096, PROT_NONE) = 0
clone(child_stack=0x40401250, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x404019d0, tls=0x40401940, child_tidptr=0x404019d0) = 1758
[...]
futex(0x402009d0, FUTEX_WAIT, 1757, NULL <unfinished ...>


--------------------------------------------------------------------

blastall -a 1
-l h_vmem=7G
works OK

stat("/.../env_nt.02.nsi", {st_mode=S_IFREG|0644, st_size=5555499, ...}) = 0
mmap(NULL, 528384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab757e000
mmap(NULL, 135168, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab75ff000
munmap(0x2aaab757e000, 528384) = 0
mmap(NULL, 708608, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab7620000
stat("/.../env_nt.00.nsq", {st_mode=S_IFREG|0644, st_size=780519865, ...}) = 0


--------------------------------------------------------------------

blastall -a 8
-l h_vmem=7G
DOESN'T WORK

stat("/scratch/mgarcia/db/env_nt.02.nsi", {st_mode=S_IFREG|0644, st_size=5555499, ...}) = 0
mmap(NULL, 528384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab757e000
mmap(NULL, 135168, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab75ff000
munmap(0x2aaab757e000, 528384) = 0
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
brk(0xfa37000) = 0xfa37000
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
--- SIGSEGV (Segmentation fault) @ 0 (0) ---
+++ killed by SIGSEGV (core dumped) +++


--------------------------------------------------------------------

blastall -a 8
-l s_vmem=7G
DOESN'T WORK

stat("/scratch/mgarcia/db/env_nt.02.nsi", {st_mode=S_IFREG|0644, st_size=5555499, ...}) = 0
mmap(NULL, 528384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab757e000
mmap(NULL, 135168, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaab75ff000
munmap(0x2aaab757e000, 528384)    = 0
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
brk(0x5e9d000)                    = 0x5e9d000
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|0x40, -1, 0) = -1 ENOMEM (Cannot allocate memory)
mmap(NULL, 7516196864, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = -1 ENOMEM (Cannot allocate memory)
--- SIGSEGV (Segmentation fault) @ 0 (0) ---
+++ killed by SIGSEGV (core dumped) +++


--------------------------------------------------------------------



Why is it trying to allocate that much memory before creating threads when h_vmem or s_vmem are requested??

Is it trying to allocate for each child the same memory I requested for the parent job??

Is there any workaround?



PS: I need to ask for h_vmem, as it is a consumable resource in my host, and if I say nothing, the default value is -l h_vmem=0

------------------------------------------------------
http://gridengine.sunsource.net/ds/viewMessage.do?dsForumId=38&dsMessageId=238572

To unsubscribe from this discussion, e-mail: [users-unsubscribe at gridengine.sunsource.net].



More information about the gridengine-users mailing list