Training an Xception model in Keras - batch size 32 gives an error, but it works with batch size 16
Below are the details of the error log; can you help me? I'm guessing the following line is the key to the error, but I can't figure it out: OOM when allocating tensor with shape[728,728,1,1] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
ResourceExhaustedError Traceback (most recent call last)
Cell In[34], line 7
2 model_save = ModelCheckpoint('/kaggle/working/model_weights.keras' , monitor = 'val_loss', save_best_only = True, mode = 'min')
3 reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,
4 patience=4, min_lr=0.0001)
----> 7 history = model.fit(train_it, steps_per_epoch= steps_per_epoch, validation_data=val_it,
8 validation_steps=validation_steps, epochs = epochs, callbacks=[early_stopping, model_save, reduce_lr] )
File /opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
67 filtered_tb = _process_traceback_frames(e.__traceback__)
68 # To get the full stack trace, call:
69 # `tf.debugging.disable_traceback_filtering()`
---> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
File /opt/conda/lib/python3.10/site-packages/tensorflow/python/eager/execute.py:52, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
50 try:
51 ctx.ensure_initialized()
---> 52 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
53 inputs, attrs, num_outputs)
54 except core._NotOkStatusException as e:
55 if name is not None:
ResourceExhaustedError: Graph execution error:
Detected at node 'model_1/block6_sepconv2/separable_conv2d' defined at (most recent call last):
File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
app.launch_new_instance()
File "/opt/conda/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
app.start()
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 728, in start
self.io_loop.start()
File "/opt/conda/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
self.asyncio_loop.run_forever()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
self._run_once()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
handle._run()
File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
self._context.run(self._callback, *self._args)
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
await self.process_one()
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 502, in process_one
await dispatch(*args)
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
await result
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
reply_content = await reply_content
File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
res = shell.run_cell(
File "/opt/conda/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
return super().run_cell(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
result = self._run_cell(
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
result = runner(coro)
File "/opt/conda/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
coro.send(None)
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
if await self.run_code(code, result, async_=asy):
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "/tmp/ipykernel_33/698136834.py", line 7, in <module>
history = model.fit(train_it, steps_per_epoch= steps_per_epoch, validation_data=val_it,
File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1685, in fit
tmp_logs = self.train_function(iterator)
File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function
return step_function(self, iterator)
File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step
outputs = model.train_step(data)
File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1050, in train_step
y_pred = self(x, training=True)
File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 558, in __call__
return super().__call__(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1145, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/keras/engine/functional.py", line 512, in call
return self._run_internal_graph(inputs, training=training, mask=mask)
File "/opt/conda/lib/python3.10/site-packages/keras/engine/functional.py", line 669, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1145, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/keras/layers/convolutional/separable_conv2d.py", line 188, in call
outputs = tf.compat.v1.nn.separable_conv2d(
Node: 'model_1/block6_sepconv2/separable_conv2d'
OOM when allocating tensor with shape[728,728,1,1] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node model_1/block6_sepconv2/separable_conv2d}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
[Op:__inference_train_function_39551]
There is no way around this other than reducing the batch size (unless you want to shrink the input images or switch to a smaller architecture).
The batch size does not have to be a power of 2: you can try a batch_size of 24, increasing gradually from 16 until you hit the limit / make full use of the GPU memory.
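For example, here is a minimal sketch of re-creating the iterators with an intermediate batch size. It assumes train_it and val_it come from ImageDataGenerator.flow_from_directory (the question does not show how they were built, so the paths and target size below are placeholders):

from tensorflow.keras.preprocessing.image import ImageDataGenerator

batch_size = 24            # anything between 16 (fits) and 32 (OOM); no need for a power of 2
target_size = (299, 299)   # Xception's default input size; reducing this also saves GPU memory

datagen = ImageDataGenerator(rescale=1.0 / 255)

train_it = datagen.flow_from_directory(
    '/kaggle/input/dataset/train',   # placeholder path
    target_size=target_size,
    batch_size=batch_size,
    class_mode='categorical',
)
val_it = datagen.flow_from_directory(
    '/kaggle/input/dataset/val',     # placeholder path
    target_size=target_size,
    batch_size=batch_size,
    class_mode='categorical',
)

# steps_per_epoch / validation_steps must be recomputed for the new batch size
steps_per_epoch = train_it.samples // batch_size
validation_steps = val_it.samples // batch_size

Lowering target_size below Xception's default 299x299 also cuts activation memory, at some cost in accuracy.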