Well, it's interesting... I'm now using semaphores to protect the shared memory, and I'm still getting the same behavior as before, where for some reason one of the child processes goes to sleep. This time, however, strace shows that the last system call made is:
Code:
futex(0x7f21e73005d4, FUTEX_WAIT_PRIVATE, 2, NULL
Instead of the regular FUTEX_WAIT.
When I attach to the child with gdb and do a backtrace, it looks like it's running a function that gdb can't resolve (which makes sense, as it looks like the process is being put to sleep by some other process, maybe the OS?). Here's the output from the gdb backtrace:
Code:
GNU gdb (GDB; openSUSE 11.1) 6.8.50.20081120-cvs
Copyright (C) 2008 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-suse-linux".
For bug reporting instructions, please see:
<http://bugs.opensuse.org/>.
Attaching to process 27973
Reading symbols from /home/miscem/dev/app_status/app_status...done.
Reading symbols from /lib64/librt.so.1...done.
Loaded symbols for /lib64/librt.so.1
Reading symbols from /lib64/libc.so.6...done.
Loaded symbols for /lib64/libc.so.6
Reading symbols from /lib64/libpthread.so.0...done.
[Thread debugging using libthread_db enabled]
Loaded symbols for /lib64/libpthread.so.0
Reading symbols from /lib64/ld-linux-x86-64.so.2...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
0x00007f21e7089e6e in ?? () from /lib64/libc.so.6
(gdb) bt
#0 0x00007f21e7089e6e in ?? () from /lib64/libc.so.6
#1 0x00007f21e703e9ed in ?? () from /lib64/libc.so.6
#2 0x00007f21e703e7a6 in ?? () from /lib64/libc.so.6
#3 0x00007f21e703cd00 in ctime_r () from /lib64/libc.so.6
#4 0x00000000004020be in log_message (filename=0x6092cc "/var/log/app_status", message=0x7fffef7282c0 "child[27973]: 10.41.1.187: Recieved SIGUSR1, Max procs reached!?", t=101 'e')
at includes/cust_utils.h:223
#5 0x0000000000404c35 in signal_handler (sig=10) at includes/init_utils.h:561
#6 <signal handler called>
#7 0x00007f21e703c7b0 in ?? () from /lib64/libc.so.6
#8 0x00007f21e703e832 in ?? () from /lib64/libc.so.6
#9 0x00007f21e703cd00 in ctime_r () from /lib64/libc.so.6
#10 0x00000000004020be in log_message (filename=0x6092cc "/var/log/app_status", message=0x7fffef728c50 "child[27973]: 10.41.1.187: dispatcher: request: \"GET /app HTTP/1.0\"", t=100 'd')
at includes/cust_utils.h:223
#11 0x0000000000405f20 in dispatcher (clientfd=6, hostname=0x7fffef72a4c0 "10.41.1.187") at includes/http_utils.h:319
#12 0x0000000000406574 in main (argc=1, argv=0x7fffef72aab8) at app_status.c:151
(gdb)
One interesting thing I noticed is that when I compiled it as a 32-bit executable, it didn't have the problem nearly as often. But once it did, the gdb backtrace showed this:
Code:
GNU gdb (GDB; openSUSE 11.1) 6.8.50.20081120-cvs
Copyright (C) 2008 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-suse-linux".
For bug reporting instructions, please see:
<http://bugs.opensuse.org/>.
Attaching to process 10695
Reading symbols from /home/miscem/dev/app_status/app_status...done.
Reading symbols from /lib/librt.so.1...done.
Loaded symbols for /lib/librt.so.1
Reading symbols from /lib/libc.so.6...done.
Loaded symbols for /lib/libc.so.6
Reading symbols from /lib/libpthread.so.0...done.
[Thread debugging using libthread_db enabled]
Loaded symbols for /lib/libpthread.so.0
Reading symbols from /lib/ld-linux.so.2...done.
Loaded symbols for /lib/ld-linux.so.2
0xffffe430 in __kernel_vsyscall ()
(gdb) bt
#0 0xffffe430 in __kernel_vsyscall ()
#1 0xf7e90e93 in ?? () from /lib/libc.so.6
#2 0xf7e3e44b in ?? () from /lib/libc.so.6
Backtrace stopped: previous frame identical to this frame (corrupt stack?)
(gdb)
Which looks dubious. Also, here are my modified code snippets... maybe I missed something, or I'm not implementing the semaphores correctly?
I added this for the creation of the semaphores. I removed the int "updating" from the shared context struct I was using before and added two sem_t members to it, called mod_sem and lc_sem.
Code:
// Initialise the two process-shared semaphores that live in the shared
// context: mod_sem starts at 1 (available), lc_sem starts at 0.
// Either failure is logged with the errno text and reported as return 1.
if (sem_init(&scontext->mod_sem, 1, 1) != 0) {
    sprintf(buf, "%s sem_init(mod_sem): %s", context.ident, strerror(errno));
    log_message(context.log_file, buf, 'e');
    return 1;
}

// mod_sem is ready; lc_sem is only attempted once mod_sem succeeded.
if (sem_init(&scontext->lc_sem, 1, 0) != 0) {
    sprintf(buf, "%s sem_init(lc_sem): %s", context.ident, strerror(errno));
    log_message(context.log_file, buf, 'e');
    return 1;
}
Here is the updated snippet showing the check on the number of child processes, the fork, and the increment and decrement of the child counter.
Code:
// Throttle check, fork, and child-count bookkeeping for one accepted client.
// scontext->children is the shared child counter; every read/write of it in
// this excerpt is bracketed by sem_wait/sem_post on mod_sem.

// Take a snapshot of the shared counter under mod_sem.
sem_wait(&scontext->mod_sem); //Enter CS
cur_children = scontext->children; //Get Copy of val
sem_post(&scontext->mod_sem); //Leave CS
// If throttling is "on" and one more child would exceed the configured
// maximum, log it, signal SIGUSR1, and block on lc_sem before forking.
// NOTE(review): cur_children is a snapshot taken before this test; the
// shared counter may already have changed by the time kill() runs.
if (strcasecmp(context.throttle_procs,"on") == 0 && cur_children+1 > context.int_throttle_procs_max) {
sprintf(buf,"%s Max processes(%d) reached, sending SIGUSR1 signal to children",context.ident,context.int_throttle_procs_max);
log_message(context.log_file,buf,'e');
// Per POSIX kill(), pid 0 targets every process in the caller's process
// group — presumably that includes this process as well; confirm how
// the parent handles SIGUSR1.
kill(0,SIGUSR1);
// NOTE(review): nothing in this excerpt posts lc_sem; the normal child
// exit path below decrements the counter without posting it, so this
// wait relies entirely on code outside this snippet — verify it cannot
// block forever when children finish on their own.
sem_wait(&scontext->lc_sem); //wait for last child to signal no more children
}
// NOTE(review): there is no branch for pid < 0 (fork failure) in this excerpt.
if ((pid=fork()) >= 0) {
//**--This is the Child Process--**//
if (pid == 0) {
cpid=getpid();
sprintf(context.ident,"child[%d]:",cpid);
// Derive this child's ordinal from the shared counter.
// NOTE(review): the parent increments the counter in its own branch
// after fork(); whether this read sees the value before or after that
// increment depends on scheduling — confirm the intended numbering.
sem_wait(&scontext->mod_sem); //Enter CS
context.child=scontext->children+1;
sem_post(&scontext->mod_sem); //Leave CS
// Serve the client, then log the exit.
dispatcher(client_sockfd,hostname);
sprintf(buf,"%s process #%d exiting",context.ident,context.child);
log_message(context.log_file,buf,'d');
// Normal exit: give the slot back by decrementing the shared counter.
sem_wait(&scontext->mod_sem); //Enter CS
scontext->children--;
sem_post(&scontext->mod_sem); //Leave CS
det_shrd_context(scontext);
close(client_sockfd);
exit(0);
}
//**--This is the parent Process--**//
else {
// Count the new child and format the log line while still holding
// mod_sem, so the logged number matches the value just written; the
// actual log_message call happens after the semaphore is released.
sem_wait(&scontext->mod_sem); //Enter CS
scontext->children++;
sprintf(buf,"%s Sent to child(%d) process #%d",context.ident,pid,scontext->children);
sem_post(&scontext->mod_sem); //Leave CS
log_message(context.log_file,buf,'d');
// The parent no longer needs its copy of the client socket.
close(client_sockfd);
}
}
And here is the function that gets called by the children when they receive a SIGUSR1:
Code:
/*
 * Forced-exit path for a child process.
 *
 * Marks the process as no longer running, closes its descriptors, removes
 * itself from the shared child counter and, if it was the last child, posts
 * lc_sem so that whoever is waiting on it can continue. Logs the forced exit
 * and terminates the process with exit(-1); this function does not return.
 *
 * The critical section guarded by mod_sem is kept to the counter update
 * only. Logging is done after mod_sem has been released, so a slow or
 * blocked log_message() cannot stall every other process that needs the
 * counter.
 *
 * NOTE(review): if this function runs in signal-handler context, then
 * log_message(), snprintf(), sem_wait() and exit() are not on the POSIX
 * async-signal-safe list; interrupting code that already holds mod_sem or a
 * libc-internal lock can deadlock here. Consider having the handler only set
 * a flag and calling this from normal control flow — confirm the caller.
 */
void sig_handle_force_exit(void) {
    char buf[KB];
    struct shrd_context *scontext;
    int last_child;

    context.running = FALSE;
    close_all_fd();

    /* NOTE(review): the failure value of get_shrd_context() is not visible
     * here; the result is used unchecked, as in the original. */
    scontext = (struct shrd_context *)get_shrd_context(context.shm_id);

    /* Critical section: update the counter and remember whether we were
     * the last child. Nothing that can block for long goes in here. */
    sem_wait(&scontext->mod_sem);
    scontext->children--;
    last_child = (scontext->children == 0);
    sem_post(&scontext->mod_sem);

    if (last_child) {
        snprintf(buf, sizeof buf, "%s I'm the last child, signaling....", context.ident);
        log_message(context.log_file, buf, 'd');
        /* Wake the process waiting for the last child to go away. */
        sem_post(&scontext->lc_sem);
    }

    det_shrd_context(scontext);

    snprintf(buf, sizeof buf, "%s process #%d forced exit.", context.ident, context.child);
    log_message(context.log_file, buf, 'w');
    exit(-1);
}
Thanks again for taking the time to help me as I learn these things. I've already learned a ton from what's been posted already.