Browse Source

Pool handling enhancement + bugfix in worker sighandling

The responder_loop now handles child creation & deletion better. A conf struct
member indicates the number of seconds the pool has to be idle before deleting
a worker.
In the same way, the pool has to be busy for at least 1s before a new
process can be added (the -f flag indicates whether the counter has to be reset
when a new process is created, allowing spawn bursts).
The spawn() function now calls sigaction twice, once for SIGINT and once for
SIGTERM, in order to catch both.
Yann Weber 4 years ago
parent
commit
710ab8281c
1 changed files with 86 additions and 88 deletions
  1. 86
    88
      src/responder.c

+ 86
- 88
src/responder.c View File

@@ -37,8 +37,10 @@ int responder_loop()
37 37
 	struct timespec timeout;
38 38
 	/**@brief watchdog timeout */
39 39
 	struct timespec pool_timeout;
40
-	short idle;
40
+	time_t idle_start, busy_start;
41
+	short idle, busy;
41 42
 	struct sigaction act;
43
+	char *statusstr;
42 44
 
43 45
 	act.sa_handler = pool_sighandler;
44 46
 	sigemptyset(&act.sa_mask);
@@ -57,7 +59,7 @@ int responder_loop()
57 59
 	sop.sem_flg = 0;
58 60
 	timeout.tv_sec = 0;
59 61
 	timeout.tv_nsec = 100000000;
60
-	idle = 0;
62
+	idle = busy = 0;
61 63
 
62 64
 	pyfcgi_logger_set_ident("Workpool");
63 65
 
@@ -126,53 +128,32 @@ int responder_loop()
126 128
 				       ret);
127 129
 				continue;
128 130
 			}
129
-			idle=0;
130
-			sop.sem_op = -1;
131
-			ret = semtimedop(semid, &sop, 1, &timeout);
132
-			sop.sem_op = 0;
133
-			if(ret < 0)
134
-			{
135
-				err = errno;
136
-				if(err != EAGAIN) //can fail if wrokers timeout
137
-				{
138
-					pyfcgi_log(LOG_ALERT,
139
-					       "Unable to dec sem after child exit : %s",
140
-					       strerror(err));
141
-					clean_exit(err);
142
-				}
143
-			}
144
-			if(status)
131
+			if(WIFSIGNALED(status))
145 132
 			{
146
-				if(WIFSIGNALED(status))
147
-				{
148
-					if(WTERMSIG(status) == 11)
149
-					{
150
-						pyfcgi_log(LOG_ALERT,
151
-							"Worker[%d] segfault !",
152
-							n);
153
-					}
154
-					else
155
-					{
156
-						pyfcgi_log(LOG_ALERT,
157
-							"Worker[%d] terminated by signal %d",
158
-							n, WTERMSIG(status));
159
-					}
160
-				}
161
-				if(WEXITSTATUS(status) & PYFCGI_FATAL)
133
+				if(WTERMSIG(status) == 11)
162 134
 				{
163 135
 					pyfcgi_log(LOG_ALERT,
164
-						"Worker[%d] exited with status FATAL",
136
+						"Worker[%d] segfault !",
165 137
 						n);
166
-					//TODO : restart ?
167 138
 				}
168 139
 				else
169 140
 				{
170
-					pyfcgi_log(LOG_WARNING,
171
-						"Worker[%d] exited with status %d",
172
-						n, WEXITSTATUS(status));
141
+					pyfcgi_log(LOG_ALERT,
142
+						"Worker[%d] terminated by signal %s(%d)",
143
+						n, strsignal(WTERMSIG(status)),
144
+						WTERMSIG(status));
173 145
 				}
174 146
 			}
175
-			else
147
+			if(WEXITSTATUS(status))
148
+			{
149
+				statusstr = status2str(WEXITSTATUS(status));
150
+				pyfcgi_log((WEXITSTATUS(status)&PYFCGI_FATAL)?
151
+						LOG_ALERT:LOG_WARNING,
152
+					"Worker[%d] exited with status %s",
153
+					n, statusstr);
154
+				free(statusstr);
155
+			}
156
+			if(!status)
176 157
 			{
177 158
 				pyfcgi_log(LOG_INFO,
178 159
 				       "Worker[%d] PID %d exited normally",
@@ -180,73 +161,88 @@ int responder_loop()
180 161
 			}
181 162
 
182 163
 			// respawn on same slot
183
-			pyfcgi_log(LOG_INFO, "respawn #%d", n);
164
+			pyfcgi_log(LOG_DEBUG, "respawning worker #%d", n);
184 165
 			wrk_pids[n] = spawn(n);
185 166
 
186
-			
187
-		}
188
-		// Stopping & deleting useless childs
189
-		if(wanted_n < n_wrk)
190
-		{	// need to shift the list and dec n_wrk
191
-			pyfcgi_log(LOG_DEBUG, "GC Workers");
192
-			n_wrk--;
193
-			kill(wrk_pids[n_wrk], SIGTERM);
194
-			nanosleep(&timeout, NULL);
195
-			kill(wrk_pids[n_wrk], SIGKILL);
196
-			nanosleep(&timeout, NULL);
197
-			if( (ret = waitpid(wrk_pids[n_wrk], &status, WNOHANG)) < 0 )
198
-			{
199
-				pyfcgi_log(LOG_ERR, "Unable to kill child %d (PID %d)",
200
-					n_wrk, wrk_pids[n_wrk]);
201
-			}
202
-			else
203
-			{
204
-				pyfcgi_log(LOG_INFO, "worker[%d](%d) killed",
205
-					n_wrk, wrk_pids[n_wrk]);
206
-			}
207 167
 			continue;
208 168
 		}
209 169
 
170
+		// Check if the pool is idle or busy
210 171
 		ret = semtimedop(semid, &sop, 1, &timeout);
211
-//pyfcgi_log( LOG_DEBUG, "semtimeop ret=%d want %d have %d", ret, wanted_n, n_wrk);
212 172
 		if(ret < 0)
213 173
 		{
214 174
 			err = errno;
215
-			if(err == EAGAIN)
175
+			if(err != EAGAIN)
216 176
 			{
217
-//pyfcgi_log(LOG_DEBUG, "IDLE want %d have %d\t min=%d", wanted_n, n_wrk, min_wrk);
218
-				// workers idle
219
-				if(!idle)
220
-				{
221
-					idle = 1;
222
-				}
223
-				else if(wanted_n > PyFCGI_conf.min_wrk
224
-					&& n_wrk - wanted_n < 2)
225
-				{
226
-					wanted_n--;
227
-				}
228
-				continue;
177
+				pyfcgi_log(LOG_ERR, "Unable to read semaphore : %s",
178
+				       strerror(err));
179
+				exit(PYFCGI_FATAL);
180
+			}
181
+			// workers idle
182
+			busy = 0;
183
+			if(!idle)
184
+			{
185
+				idle = 1;
186
+				idle_start = time(NULL);	
187
+			}
188
+			else if((time(NULL) - idle_start) > PyFCGI_conf.worker_gc_timeout &&
189
+				wanted_n > PyFCGI_conf.min_wrk
190
+				&& n_wrk - wanted_n < 2)
191
+			{
192
+				wanted_n--;
193
+				idle = 0;
229 194
 			}
230
-			pyfcgi_log(LOG_ERR, "Unable to read semaphore : %s",
231
-			       strerror(err));
232 195
 		}
233
-		if(!ret)
196
+		else if(!ret)
234 197
 		{
235
-			if(n_wrk < PyFCGI_conf.max_wrk)
198
+			idle = 0;
199
+			if(!busy)
200
+			{
201
+				busy = 1;
202
+				busy_start = time(NULL);
203
+			}
204
+			else if(time(NULL) - busy_start > 0 &&
205
+				wanted_n <= PyFCGI_conf.max_wrk)
236 206
 			{
237
-				idle=0;
238 207
 				pyfcgi_log( LOG_DEBUG,
239 208
 					"All workers busy, spawning a new one");
240 209
 				n = n_wrk;
241 210
 				n_wrk++;
242 211
 				wanted_n = n_wrk;
243 212
 				wrk_pids[n] = spawn(n);
213
+				if(!PyFCGI_conf.worker_fast_spawn)
214
+				{
215
+					busy_start = time(NULL);
216
+				}
217
+			}
218
+		}
219
+
220
+		// Stopping & deleting useless childs
221
+		if(wanted_n < n_wrk && idle)
222
+		{	// need to shift the list and dec n_wrk
223
+			busy = 0;
224
+			n_wrk--;
225
+			kill(wrk_pids[n_wrk], SIGTERM);
226
+			nanosleep(&timeout, NULL);
227
+			if( (ret = waitpid(wrk_pids[n_wrk], &status, WNOHANG)) < 0 )
228
+			{
229
+				pyfcgi_log(LOG_ERR, "Pool idle since %ds but unable to kill child %d (PID %d)",
230
+					PyFCGI_conf.worker_gc_timeout,
231
+					n_wrk, wrk_pids[n_wrk]);
232
+				kill(wrk_pids[n_wrk], SIGKILL);
244 233
 			}
245 234
 			else
246 235
 			{
247
-				nanosleep(&timeout, NULL);
236
+				pyfcgi_log(LOG_INFO, "Pool idle since %ds : worker[%d](%d) killed",
237
+					PyFCGI_conf.worker_gc_timeout,
238
+					n_wrk, wrk_pids[n_wrk]);
248 239
 			}
240
+			idle = 0;
241
+			continue;
249 242
 		}
243
+
244
+
245
+		nanosleep(&timeout, NULL);
250 246
 	}
251 247
 	
252 248
 	pyfcgi_wd_arm();
@@ -267,17 +263,12 @@ int responder_loop()
267 263
 pid_t spawn(int wrk_id)
268 264
 {
269 265
 	pid_t res;
270
-	struct timespec timeout;
271 266
 	struct timespec wd_timeout;
272 267
 	struct sigaction act;
273 268
 	char ident[128];
274 269
 
275
-	timeout.tv_sec = 0;
276
-	timeout.tv_nsec = 100000000;
277
-
278 270
 	act.sa_handler = worker_sighandler;
279 271
 	sigemptyset(&act.sa_mask);
280
-	sigaddset(&act.sa_mask, SIGTERM);
281 272
 	act.sa_flags = 0;
282 273
 	act.sa_restorer = NULL;
283 274
 
@@ -292,6 +283,8 @@ pid_t spawn(int wrk_id)
292 283
 	else if(!res)
293 284
 	{
294 285
 		// Child process
286
+		PyFCGI_conf.context.ppid = PyFCGI_conf.context.pid;
287
+		PyFCGI_conf.context.pid = getpid();
295 288
 		snprintf(ident, 128, "Worker%2d", wrk_id);
296 289
 		pyfcgi_logger_set_ident(ident);
297 290
 		// Set handler for SIGINT & SIGTERM
@@ -300,6 +293,11 @@ pid_t spawn(int wrk_id)
300 293
 			perror("Sigaction error for pool process");
301 294
 			exit(PYFCGI_FATAL);
302 295
 		}
296
+		if(sigaction(SIGTERM, &act, NULL))
297
+		{
298
+			perror("Sigaction2 error for pool process");
299
+			exit(PYFCGI_FATAL);
300
+		}
303 301
 		// Set watchdog
304 302
 		if(PyFCGI_conf.worker_timeout)
305 303
 		{
@@ -320,7 +318,7 @@ pid_t spawn(int wrk_id)
320 318
 	// Sleep to avoid spawning like hell thinking all workers are
321 319
 	// busy. Let some time to this one to go up...
322 320
 	// TODO: find a better way to avoid spawning to max_wrk
323
-	nanosleep(&timeout, NULL);
321
+	//nanosleep(&timeout, NULL);
324 322
 	pyfcgi_log(	LOG_INFO,
325 323
 		"Worker #%d spawned with PID %d", wrk_id, res);
326 324
 	return res;

Loading…
Cancel
Save