When a CPU WU is running and a GPU WU is stuck waiting for an assignment, the client asks for a WU that fits a system with 0 CPUs and one GPU. If there's nothing available for that GPU, the request just repeats with exponential back-off. That is fine. But when the CPU WU finishes, the next WU request should tell the server that CPUs and a GPU are available. It doesn't: it still says there are 0 CPU cores free. So the client gets stuck indefinitely, asking for a GPU WU over and over and never a CPU WU. The only workaround is to pause and unpause.
Here are the logs at log level 3 with debugging enabled. The log-domain levels are:
Code: Select all
Group:d:1,App:d:0,SSLContext:d:5,Thread:i:5,RWLock:d:5,TarFileReader:d:5,Subprocess:d:0,SubprocessPool:d:5
Code: Select all
10:48:09:I1:TailFileToLog:WU330:Completed 245000 out of 250000 steps (98%)
10:48:58:I1:TailFileToLog:WU330:Completed 247500 out of 250000 steps (99%)
10:49:48:I1:TailFileToLog:WU330:Completed 250000 out of 250000 steps (100%)
10:49:49:I1:TailFileToLog:WU330:Saving result file ../logfile_01.txt
10:49:49:I1:TailFileToLog:WU330:Saving result file frame210.gro
10:49:49:I1:TailFileToLog:WU330:Saving result file frame210.xtc
10:49:49:I1:TailFileToLog:WU330:Saving result file md.log
10:49:49:I1:TailFileToLog:WU330:Saving result file science.log
10:49:49:I1:TailFileToLog:WU330:Saving result file state.cpt
10:49:49:I1:TailFileToLog:WU330:Folding@home Core Shutdown: FINISHED_UNIT
10:49:50:I1:Unit:WU330:Core returned FINISHED_UNIT (100)
10:49:50:D1:Group:Default:Remaining CPUs: 16, Remaining GPUs: 0, Active WUs: 1
10:49:50:I1:Unit:WU330:Uploading WU results
10:49:51:I1:Request:OUT29:> CONNECT highland1.seas.upenn.edu:443 HTTP/1.1
10:49:51:I1:Unit:WU330:UPLOAD 100% 85B of 85B
10:49:51:I1:Request:OUT29:> POST https://highland1.seas.upenn.edu/api/results HTTP/1.1
10:49:57:I1:Unit:WU330:UPLOAD 100% 18.80MiB of 18.80MiB
10:50:27:I1:Request:OUT29:< HTTP/1.1 200 HTTP_OK
10:50:27:I1:Unit:WU330:Credited
10:50:27:D3:Unit:WU330:Cleaning WU
10:50:27:D1:Group:Default:Remaining CPUs: 16, Remaining GPUs: 0, Active WUs: 1
10:53:12:I1:Unit:WU331:Requesting WU assignment for user Grimoire_of_Lolice team 230362
10:53:12:D3:Unit:WU331:{
10:53:12:D3:Unit:WU331: "time": "2025-04-11T10:53:12Z",
10:53:12:D3:Unit:WU331: "wu": 331,
10:53:12:D3:Unit:WU331: "version": "8.4.9",
10:53:12:D3:Unit:WU331: "id": "[removed]",
10:53:12:D3:Unit:WU331: "user": "Grimoire_of_Lolice",
10:53:12:D3:Unit:WU331: "team": 230362,
10:53:12:D3:Unit:WU331: "passkey": "[removed]",
10:53:12:D3:Unit:WU331: "os": {"version": "6.1", "type": "linux", "memory": 18570428416},
10:53:12:D3:Unit:WU331: "project": {"cause": "parkinsons", "beta": true},
10:53:12:D3:Unit:WU331: "resources": {
10:53:12:D3:Unit:WU331: "cpu": {"cpu": "amd64", "cpus": 0, "vendor": "AuthenticAMD", "signature": 10948417, "family": 25, "model": 116, "stepping": 1, "features": 9149117666224241663, "extended": 18119809652725673, "80000001": 8485406217078045695},
10:53:12:D3:Unit:WU331: "gpu:03:00:00": {
10:53:12:D3:Unit:WU331: "gpu": "amd",
10:53:12:D3:Unit:WU331: "vendor": 4098,
10:53:12:D3:Unit:WU331: "device": 5567,
10:53:12:D3:Unit:WU331: "opencl": {"platform": 0, "device": 0, "compute": "2.0", "driver": "3625.0"}
10:53:12:D3:Unit:WU331: }
10:53:12:D3:Unit:WU331: }
10:53:12:D3:Unit:WU331:}
10:53:12:I1:Request:OUT36:> CONNECT assign1.foldingathome.org:443 HTTP/1.1
10:53:13:I1:Request:OUT36:> POST https://assign1.foldingathome.org/api/assign HTTP/1.1
10:53:14:I1:Request:OUT36:< HTTP/1.1 503 HTTP_SERVICE_UNAVAILABLE
10:53:14:E :Request:OUT36:HTTP_SERVICE_UNAVAILABLE: {"error":"No appropriate assignment"}
10:53:14:I1:Unit:WU331:Retry #18 in 8 mins 32 secs
10:53:28:D3:Remote:msg: {"state": "pause", "cmd": "state", "time": "2025-04-11T10:53:28Z"}
10:53:28:I1:Groups:Machine state pause
10:53:28:D3:Unit:WU331:Cleaning WU
10:53:35:D3:Remote:msg: {"state": "fold", "cmd": "state", "time": "2025-04-11T10:53:35Z"}
10:53:35:I1:Groups:Machine state fold
10:53:35:D1:Group:Default:Remaining CPUs: 16, Remaining GPUs: 1, Active WUs: 0
10:53:35:I1:Group:Default:Added new work unit: cpus:16 gpus:gpu:03:00:00
10:53:35:D1:Group:Default:Remaining CPUs: 0, Remaining GPUs: 0, Active WUs: 1
10:53:35:I1:Unit:WU332:Requesting WU assignment for user Grimoire_of_Lolice team 230362
10:53:35:D3:Unit:WU332:{
10:53:35:D3:Unit:WU332: "time": "2025-04-11T10:53:35Z",
10:53:35:D3:Unit:WU332: "wu": 332,
10:53:35:D3:Unit:WU332: "version": "8.4.9",
10:53:35:D3:Unit:WU332: "id": "[removed]",
10:53:35:D3:Unit:WU332: "user": "Grimoire_of_Lolice",
10:53:35:D3:Unit:WU332: "team": 230362,
10:53:35:D3:Unit:WU332: "passkey": "[removed]",
10:53:35:D3:Unit:WU332: "os": {"version": "6.1", "type": "linux", "memory": 18399158272},
10:53:35:D3:Unit:WU332: "project": {"cause": "parkinsons", "beta": true},
10:53:35:D3:Unit:WU332: "resources": {
10:53:35:D3:Unit:WU332: "cpu": {"cpu": "amd64", "cpus": 16, "vendor": "AuthenticAMD", "signature": 10948417, "family": 25, "model": 116, "stepping": 1, "features": 9149117666224241663, "extended": 18119809652725673, "80000001": 8485406217078045695},
10:53:35:D3:Unit:WU332: "gpu:03:00:00": {
10:53:35:D3:Unit:WU332: "gpu": "amd",
10:53:35:D3:Unit:WU332: "vendor": 4098,
10:53:35:D3:Unit:WU332: "device": 5567,
10:53:35:D3:Unit:WU332: "opencl": {"platform": 0, "device": 0, "compute": "2.0", "driver": "3625.0"}
10:53:35:D3:Unit:WU332: }
10:53:35:D3:Unit:WU332: }
10:53:35:D3:Unit:WU332:}
10:53:35:I1:Request:OUT39:> CONNECT assign2.foldingathome.org:443 HTTP/1.1
10:53:35:I1:Request:OUT39:> POST https://assign2.foldingathome.org/api/assign HTTP/1.1
10:53:37:I1:Request:OUT39:< HTTP/1.1 200 HTTP_OK
10:53:37:I1:Unit:WU332:Received WU assignment rdgI9KPYZX_I5r9yCw9J-VdPg6LlkYE6IdLHQZbbZE4
10:53:37:D3:Unit:WU332:Received assignment for 16 cpus and 1 gpus
10:53:37:I1:Unit:WU332:Downloading WU
10:53:37:D1:Group:Default:Remaining CPUs: 0, Remaining GPUs: 0, Active WUs: 1
10:53:37:I1:Request:OUT40:> CONNECT vav21.fah.temple.edu:443 HTTP/1.1
10:53:38:I1:Request:OUT40:> POST https://vav21.fah.temple.edu/api/assign HTTP/1.1
10:53:40:I1:Unit:WU332:DOWNLOAD 20% 143.16KiB of 733.30KiB
10:53:41:I1:Unit:WU332:DOWNLOAD 68% 501.05KiB of 733.30KiB
10:53:42:I1:Request:OUT40:< HTTP/1.1 200 HTTP_OK
10:53:42:I1:Unit:WU332:Received WU P18496 R50 C3 G149
I'm trying to find out whether anyone else has been bitten by this before, or whether there might be something odd about my setup. I lost about 14 hours of work because of this. Once the CPU WU finished and uploaded, the client asked for a GPU WU over and over (always being denied, which is fine) and never asked for a CPU WU until I noticed and paused/unpaused.