8-bit quantized loading issues #6
by mmoya - opened
Hello, I'm trying to load the following model using 8-bit precision:

from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("philschmid/flan-t5-xxl-sharded-fp16", load_in_8bit=True, device_map="auto")

Whenever I load with load_in_8bit, I run into the traceback below. I'm currently using an ml.p3.2xlarge instance via SageMaker. I would greatly appreciate any help on this.

@philschmid
ValueError Traceback (most recent call last)
Cell In[134], line 3
1 from transformers import AutoModelForSeq2SeqLM
----> 3 model = AutoModelForSeq2SeqLM.from_pretrained("philschmid/flan-t5-xxl-sharded-fp16", load_in_8bit=True,device_map='auto')
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py:471, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
469 elif type(config) in cls._model_mapping.keys():
470 model_class = _get_model_class(config, cls._model_mapping)
--> 471 return model_class.from_pretrained(
472 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
473 )
474 raise ValueError(
475 f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
476 f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
477 )
File ~/anaconda3/envs/python3/lib/python3.10/site-packages/transformers/modeling_utils.py:2591, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
2587 device_map_without_lm_head = {
2588 key: device_map[key] for key in device_map.keys() if key not in modules_to_not_convert
2589 }
2590 if "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values():
-> 2591 raise ValueError(
2592 """
2593 Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
2594 the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
2595 these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
2596 `device_map` to `from_pretrained`. Check
2597 https://huggingface.co./docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
2598 for more details.
2599 """
2600 )
2601 del device_map_without_lm_head
2603 if from_tf:
ValueError:
Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
`device_map` to `from_pretrained`. Check
https://huggingface.co./docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
for more details.
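For reference, here is a minimal sketch of the workaround the error message points to (offloading the non-8-bit modules to the CPU in fp32), based on the quantization docs linked above. This is an assumption on my part, not a verified fix: the exact flag name differs between transformers versions, and in recent releases it is exposed as `llm_int8_enable_fp32_cpu_offload` on `BitsAndBytesConfig` rather than as the `load_in_8bit_fp32_cpu_offload` kwarg named in the error text.

```python
# Sketch of the CPU-offload workaround described in the error message.
# Assumes a transformers version where BitsAndBytesConfig exposes
# llm_int8_enable_fp32_cpu_offload; adapt to your installed version.
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,  # keep offloaded modules in fp32 on CPU
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    "philschmid/flan-t5-xxl-sharded-fp16",
    quantization_config=quantization_config,
    device_map="auto",  # or a custom dict mapping module names to GPU ids / "cpu"
)
```

Note that anything dispatched to the CPU this way runs much slower, and on the single 16 GB V100 of an ml.p3.2xlarge the ~11B-parameter flan-t5-xxl is already tight in int8, so offloading (or a larger GPU) may be unavoidable.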