restore_from_checkpoint_folder = restore_from_checkpoint_folder) File "/gpfsdswork/projects/rech/geh/uzq69ur/Challenge/Task_1/fets_challenge/experiment.py", line 290, in run_challenge_experiment task_runner = copy(plan).get_task_runner(collaborator_data_loaders[col]) File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/openfl/federated/plan/plan.py", line 389, in get_task_runner self.runner_ = Plan.build(**defaults) File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/openfl/federated/plan/plan.py", line 182, in build instance = getattr(module, class_name)(**settings) File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/openfl/federated/task/runner_fets_challenge.py", line 43, in __init__ model, optimizer, train_loader, val_loader, scheduler, params = create_pytorch_objects(fets_config_dict, train_csv=train_csv, val_csv=val_csv, device=device) File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/GANDLF/compute/generic.py", line 78, in create_pytorch_objects model, amp=parameters["model"]["amp"], device=device, optimizer=optimizer File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/GANDLF/utils/tensor.py", line 159, in send_model_to_device model = nn.DataParallel(model, "[" + dev + "]") File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 138, in __init__ self.device_ids = [_get_device_index(x, True) for x in device_ids] File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 138, in self.device_ids = [_get_device_index(x, True) for x in device_ids] File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/torch/_utils.py", line 479, in _get_device_index device = torch.device(device) RuntimeError: Invalid device 
string: '[' ``` Is it possible to use multiple GPUs with the proposed code for Task 1 in this Challenge? Am I doing something wrong? Thank you, Matthis. " />

Hi,

As training is quite long, especially with the size of the training set compared to last year, I would like to use multiple GPUs. When ensuring that they are visible with

```
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"
device = "cuda"
```

I get the following error:

```
Traceback (most recent call last):
  File "./FeTS_Challenge.py", line 605, in <module>
    restore_from_checkpoint_folder = restore_from_checkpoint_folder)
  File "/gpfsdswork/projects/rech/geh/uzq69ur/Challenge/Task_1/fets_challenge/experiment.py", line 290, in run_challenge_experiment
    task_runner = copy(plan).get_task_runner(collaborator_data_loaders[col])
  File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/openfl/federated/plan/plan.py", line 389, in get_task_runner
    self.runner_ = Plan.build(**defaults)
  File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/openfl/federated/plan/plan.py", line 182, in build
    instance = getattr(module, class_name)(**settings)
  File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/openfl/federated/task/runner_fets_challenge.py", line 43, in __init__
    model, optimizer, train_loader, val_loader, scheduler, params = create_pytorch_objects(fets_config_dict, train_csv=train_csv, val_csv=val_csv, device=device)
  File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/GANDLF/compute/generic.py", line 78, in create_pytorch_objects
    model, amp=parameters["model"]["amp"], device=device, optimizer=optimizer
  File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/GANDLF/utils/tensor.py", line 159, in send_model_to_device
    model = nn.DataParallel(model, "[" + dev + "]")
  File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 138, in __init__
    self.device_ids = [_get_device_index(x, True) for x in device_ids]
  File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 138, in <listcomp>
    self.device_ids = [_get_device_index(x, True) for x in device_ids]
  File "/linkhome/rech/gencre01/uzq69ur/.conda/envs/fets_2022/lib/python3.7/site-packages/torch/_utils.py", line 479, in _get_device_index
    device = torch.device(device)
RuntimeError: Invalid device string: '['
```

Is it possible to use multiple GPUs with the proposed code for Task 1 in this Challenge? Am I doing something wrong?

Thank you,
Matthis.

Created by Matthis Manthe Matthis
Hi Matthis, Unfortunately, the code does not currently support the use of multiple GPUs. Brandon

Training with multiple GPUs page is loading…