expected scalar type Float but found Half #4

Open

xzdong-2019 opened this issue May 15, 2023 · 1 comment

@xzdong-2019
Hello,
When I run python api.py and test generation, I get the following error:
RuntimeError: expected scalar type Float but found Half

======================================
----> 1 model.generate(instruction=item["instruction"], input=item["input"])

File /data/dongxz/research/alpaca-7b-chinese/serve/model.py:139, in ModelServe.generate(self, instruction, input, temperature, top_p, top_k, num_beams, max_new_tokens, **kwargs)
137 print("generating...")
138 with torch.no_grad():
--> 139 generation_output = self.model.generate(
140 input_ids=input_ids,
141 generation_config=generation_config,
142 return_dict_in_generate=True,
143 output_scores=True,
144 max_new_tokens=max_new_tokens,
145 )
146 s = generation_output.sequences[0]
147 output = self.tokenizer.decode(s)

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/peft/peft_model.py:731, in PeftModelForCausalLM.generate(self, **kwargs)
729 try:
730 if not isinstance(peft_config, PromptLearningConfig):
--> 731 outputs = self.base_model.generate(**kwargs)
732 else:
733 if "input_ids" not in kwargs:

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/transformers/generation/utils.py:1524, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, streamer, **kwargs)
1517 input_ids, model_kwargs = self._expand_inputs_for_generation(
1518 input_ids=input_ids,
1519 expand_size=generation_config.num_beams,
1520 is_encoder_decoder=self.config.is_encoder_decoder,
1521 **model_kwargs,
1522 )
1523 # 13. run beam search
-> 1524 return self.beam_search(
1525 input_ids,
1526 beam_scorer,
1527 logits_processor=logits_processor,
1528 stopping_criteria=stopping_criteria,
1529 pad_token_id=generation_config.pad_token_id,
1530 eos_token_id=generation_config.eos_token_id,
1531 output_scores=generation_config.output_scores,
1532 return_dict_in_generate=generation_config.return_dict_in_generate,
1533 synced_gpus=synced_gpus,
1534 **model_kwargs,
1535 )
1537 elif is_beam_sample_gen_mode:
1538 # 11. prepare logits warper
1539 logits_warper = self._get_logits_warper(generation_config)

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/transformers/generation/utils.py:2810, in GenerationMixin.beam_search(self, input_ids, beam_scorer, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, **model_kwargs)
2806 break
2808 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-> 2810 outputs = self(
2811 **model_inputs,
2812 return_dict=True,
2813 output_attentions=output_attentions,
2814 output_hidden_states=output_hidden_states,
2815 )
2817 if synced_gpus and this_peer_finished:
2818 cur_len = cur_len + 1

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py:687, in LlamaForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
684 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
686 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
--> 687 outputs = self.model(
688 input_ids=input_ids,
689 attention_mask=attention_mask,
690 position_ids=position_ids,
691 past_key_values=past_key_values,
692 inputs_embeds=inputs_embeds,
693 use_cache=use_cache,
694 output_attentions=output_attentions,
695 output_hidden_states=output_hidden_states,
696 return_dict=return_dict,
697 )
699 hidden_states = outputs[0]
700 logits = self.lm_head(hidden_states)

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py:577, in LlamaModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
569 layer_outputs = torch.utils.checkpoint.checkpoint(
570 create_custom_forward(decoder_layer),
571 hidden_states,
(...)
574 None,
575 )
576 else:
--> 577 layer_outputs = decoder_layer(
578 hidden_states,
579 attention_mask=attention_mask,
580 position_ids=position_ids,
581 past_key_value=past_key_value,
582 output_attentions=output_attentions,
583 use_cache=use_cache,
584 )
586 hidden_states = layer_outputs[0]
588 if use_cache:

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py:292, in LlamaDecoderLayer.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache)
289 hidden_states = self.input_layernorm(hidden_states)
291 # Self Attention
--> 292 hidden_states, self_attn_weights, present_key_value = self.self_attn(
293 hidden_states=hidden_states,
294 attention_mask=attention_mask,
295 position_ids=position_ids,
296 past_key_value=past_key_value,
297 output_attentions=output_attentions,
298 use_cache=use_cache,
299 )
300 hidden_states = residual + hidden_states
302 # Fully Connected

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py:196, in LlamaAttention.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache)
185 def forward(
186 self,
187 hidden_states: torch.Tensor,
(...)
192 use_cache: bool = False,
193 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
194 bsz, q_len, _ = hidden_states.size()
--> 196 query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
197 key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
198 value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/peft/tuners/lora.py:710, in Linear8bitLt.forward(self, x)
706 if x.dtype != torch.float32:
707 x = x.float()
708 output = (
709 self.lora_B[self.active_adapter](
--> 710 self.lora_A[self.active_adapter](self.lora_dropout[self.active_adapter](x))
711 ).to(expected_dtype)
712 * self.scaling[self.active_adapter]
713 )
714 else:
715 output = (
716 self.lora_B[self.active_adapter](
717 self.lora_A[self.active_adapter](self.lora_dropout[self.active_adapter](x))
718 )
719 * self.scaling[self.active_adapter]
720 )

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /data/anaconda3/envs/dongxz_chatglm_v1/lib/python3.9/site-packages/torch/nn/modules/linear.py:114, in Linear.forward(self, input)
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)

RuntimeError: expected scalar type Float but found Half
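
For context, the failing frame is peft's Linear8bitLt.forward, which up-casts the LoRA input to float32 only when autocast is disabled (the branch around lines 706-713 above); the float32 activations then hit the fp16 LoRA weights inside F.linear, which raises exactly this dtype mismatch. One workaround that is sometimes suggested is to run generation under torch.autocast, so the non-casting branch (lines 714-720) is taken instead. Below is a minimal sketch only, assuming the base model is loaded in 8-bit with an fp16 LoRA adapter; model, tokenizer, input_ids, generation_config and max_new_tokens stand in for the objects built in serve/model.py and api.py, so adjust to your own setup:

```python
import torch

# Hedged sketch: model, tokenizer, input_ids, generation_config and
# max_new_tokens are placeholders for the objects built in serve/model.py.
with torch.no_grad():
    # With autocast active, peft's Linear8bitLt.forward skips the x.float()
    # up-cast (it takes the lines 714-720 branch shown in the traceback),
    # so the fp16 LoRA weights see an input of a matching dtype.
    with torch.autocast("cuda", dtype=torch.float16):
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
s = generation_output.sequences[0]
output = tokenizer.decode(s)
```

Upgrading peft (this casting logic was reworked in later releases) or keeping the LoRA adapter weights in float32 when loading them are reportedly other ways around the same mismatch.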

@jeffreynghm

I guess it's due to your Anaconda environment. Have you tried running it without Anaconda?
