[GE users] installing exec host hangs at start of sge_commd

Don Shesnicky dshesnicky at enqsemi.com
Tue Jul 6 21:05:47 BST 2004



> Are you sure your problem is not related to an /etc/hosts file where the
> hostname of the machine is aliased to the loopback address 127.0.0.1?

So I've straightened out the hostname and /etc/hosts file, pulled the 6.0
binaries and started installing. Now the sge_qmaster daemon is failing.
The startup script hangs for a while, but I think that's just because it's
checking to see if the daemon starts up. If I run sge_qmaster from the
command line it returns to the prompt immediately with no errors and nothing
in the messages file. If I run "strace sge_qmaster" it returns quickly; if I
use "strace -f" to follow forks it hangs forever. A partial trace follows.

This is the output from the startup script, but there's not much info there;
some of the lines come from my own echo statements:

   starting sge_qmaster
start cmd: /tools/sge/6.0/bin/lx24-x86/sge_qmaster
---> checking for sge_qmaster
---> ps -efww | grep sge_qmaster
root     14344 14286  0 15:50 pts/8    00:00:00 grep sge_qmaster
masterhost: canter.enqsemi.com

sge_qmaster didn't start!
Please check the messages file

   starting sge_schedd
error: getting configuration: unable to contact qmaster using port 536
on host "canter.enqsemi.com"
error: can't get configuration from qmaster -- backgrounding

Don 



At the end of this strace output it seems to be waiting on the fork from
this clone call:

[pid 30781] clone(child_stack=0x40fffbd8,
flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|0x21) = 30785

<snip>
[pid 30785] close(5)                    = 0
[pid 30785] munmap(0x40027000, 4096)    = 0
[pid 30785] socket(PF_UNIX, SOCK_STREAM, 0) = 5
[pid 30785] connect(5, {sin_family=AF_UNIX,
path="/var/run/.nscd_socket"}, 110) = -1 ENOENT (No such file or
directory)
[pid 30785] close(5)                    = 0
[pid 30785] open("/etc/hosts", O_RDONLY) = 5
[pid 30785] fcntl64(0x5, 0x1, 0, 0x1)   = 0
[pid 30785] fcntl64(0x5, 0x2, 0x1, 0x2 <unfinished ...>
[pid 30782] <... nanosleep resumed> 0)  = -1 EINTR (Interrupted system
call)
[pid 30785] <... fcntl64 resumed> )     = 0
[pid 30782] --- SIGRT_0 (Real-time signal 0) ---
[pid 30785] fstat64(5, {st_mode=S_IFREG|0644, st_size=167, ...}) = 0
[pid 30781] <... poll resumed> [{fd=3, events=POLLIN, revents=POLLIN}],
1, 2000) = 1
[pid 30785] mmap2(NULL, 4096, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...>
[pid 30781] --- SIGRT_1 (Real-time signal 1) ---
[pid 30785] <... mmap2 resumed> )       = 0x40027000
[pid 30781] sigreturn( <unfinished ...>
[pid 30785] read(5,  <unfinished ...>
[pid 30781] <... sigreturn resumed> )   = ? (mask now ~[TRAP KILL STOP])
[pid 30785] <... read resumed> "# Do not remove the following li"...,
4096) = 167
[pid 30781] getppid( <unfinished ...>
[pid 30785] read(5,  <unfinished ...>
[pid 30781] <... getppid resumed> )     = 30779
[pid 30785] <... read resumed> "", 4096) = 0
[pid 30781] wait4(-1,  <unfinished ...>
[pid 30785] close(5 <unfinished ...>
[pid 30781] <... wait4 resumed> [WIFEXITED(s) && WEXITSTATUS(s) == 0],
WNOHANG|__WCLONE, NULL) = 30782
[pid 30785] <... close resumed> )       = 0
[pid 30781] wait4(-1,  <unfinished ...>
[pid 30785] munmap(0x40027000, 4096 <unfinished ...>
[pid 30781] <... wait4 resumed> 0x81c4798, WNOHANG|__WCLONE, NULL) = -1
ECHILD (No child processes)
[pid 30785] <... munmap resumed> )      = 0
[pid 30781] read(3,
"`\35\6@\1\0\0\0\2\4\0\0\370E\6@\264Z\33\10\250$\34\10\270"..., 148) =
148
[pid 30785] open("/var/yp/binding/nis.enqsemi.com.2", O_RDONLY
<unfinished ...>
[pid 30781] modify_ldt(0x1, 0x81c473c, 0x10 <unfinished ...>
[pid 30785] <... open resumed> )        = 5
[pid 30781] <... modify_ldt resumed> )  = 0
[pid 30785] readv(5,  <unfinished ...>
[pid 30781] munmap(0x401e4000, 8388608 <unfinished ...>
[pid 30785] <... readv resumed> [{"\377\377", 2},
{"\1\0\0\0\n\n\n\n\3\205\0\0", 12}], 2) = 14
[pid 30781] <... munmap resumed> )      = 0
[pid 30785] brk(0x81e2000 <unfinished ...>
[pid 30781] poll( <unfinished ...>
[pid 30785] <... brk resumed> )         = 0x81e2000
[pid 30781] <... poll resumed> [{fd=3, events=POLLIN, revents=POLLIN}],
1, 2000) = 1
[pid 30785] socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP <unfinished ...>
[pid 30781] getppid( <unfinished ...>
[pid 30785] <... socket resumed> )      = 6
[pid 30781] <... getppid resumed> )     = 30779
[pid 30785] bind(6, {sin_family=AF_INET, sin_port=htons(858),
sin_addr=inet_addr("0.0.0.0")}}, 16 <unfinished ...>
[pid 30781] read(3,  <unfinished ...>
[pid 30785] <... bind resumed> )        = -1 EACCES (Permission denied)
[pid 30781] <... read resumed>
"`\35\6@\2\0\0\0\1\0\0\0j\20\30@\344\242\31@\324\21\36@"..., 148) = 148
[pid 30785] ioctl(6, 0x5421 <unfinished ...>
[pid 30781] kill(30785, SIGRT_1 <unfinished ...>
[pid 30785] <... ioctl resumed> , [1])  = 0
[pid 30781] <... kill resumed> )        = 0
[pid 30785] --- SIGRT_1 (Real-time signal 1) ---
[pid 30781] kill(30784, SIGRT_1 <unfinished ...>
[pid 30785] _exit(1)                    = ?
[pid 30784] <... nanosleep resumed> 0)  = -1 EINTR (Interrupted system
call)
[pid 30781] <... kill resumed> )        = 0
[pid 30784] --- SIGRT_1 (Real-time signal 1) ---
[pid 30781] --- SIGRT_1 (Real-time signal 1) ---
[pid 30784] _exit(1)                    = ?
sigreturn()                             = ? (mask now ~[TRAP KILL STOP])
--- SIGRT_1 (Real-time signal 1) ---
sigreturn()                             = ? (mask now ~[TRAP KILL STOP])
wait4(30785,  <unfinished ...>

---------------------------------------------------------------------
To unsubscribe, e-mail: users-unsubscribe at gridengine.sunsource.net
For additional commands, e-mail: users-help at gridengine.sunsource.net




More information about the gridengine-users mailing list