Upload model

- config.json: +1 -0
- configuration_t5mimo.py: +2 -0
- modeling_t5mimo.py: +86 -95
config.json CHANGED
@@ -18,6 +18,7 @@
   "initializer_factor": 0.05,
   "is_encoder_decoder": true,
   "is_gated_act": false,
+  "is_mimo": true,
   "layer_norm_epsilon": 1e-06,
   "model_type": "t5mimo",
   "num_decoder_layers": 4,
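
The flag added above is a plain config entry, so it surfaces directly on the loaded config object. A minimal sketch, assuming the repo wires up its custom classes for trust_remote_code; the repo id below is a placeholder, not part of this commit:

from transformers import AutoConfig

# Placeholder repo id; trust_remote_code=True lets the Hub load the custom
# configuration_t5mimo.py / modeling_t5mimo.py files shipped with the model.
config = AutoConfig.from_pretrained("your-org/t5mimo", trust_remote_code=True)

print(config.model_type)  # "t5mimo"
print(config.is_mimo)     # True, from the new "is_mimo" entry in config.json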
configuration_t5mimo.py CHANGED
@@ -81,6 +81,7 @@ class T5MIMOConfig(PretrainedConfig):
         classifier_dropout=0.0,
         num_seqs=3,
         num_filters=64,
+        is_mimo=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -102,6 +103,7 @@ class T5MIMOConfig(PretrainedConfig):
         self.use_cache = use_cache
         self.num_seqs = num_seqs
         self.num_filters = num_filters
+        self.is_mimo = is_mimo
 
         act_info = self.feed_forward_proj.split("-")
         self.dense_act_fn = act_info[-1]
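
For reference, a minimal sketch of constructing the config with the new argument. It assumes configuration_t5mimo.py is importable locally and relies only on parameters visible in this diff (num_seqs, num_filters, is_mimo); everything else keeps its default:

# Assumes configuration_t5mimo.py from this repo is on the import path.
from configuration_t5mimo import T5MIMOConfig

cfg = T5MIMOConfig(num_seqs=3, num_filters=64, is_mimo=True)
assert cfg.is_mimo is True

# Turning the flag off selects the standard single-sequence code paths
# in modeling_t5mimo.py (plain (batch_size, seq_length, d_model) tensors).
cfg_single = T5MIMOConfig(is_mimo=False)
assert cfg_single.is_mimo is False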
modeling_t5mimo.py CHANGED
@@ -198,8 +198,9 @@ class T5Attention(nn.Module):
         self.d_model = config.d_model
         self.key_value_proj_dim = config.d_kv
         self.n_heads = config.num_heads
-        self.dropout = config.dropout_rate
         self.inner_dim = self.n_heads * self.key_value_proj_dim
+        self.dropout = config.dropout_rate
+        self.config = config
 
         # Mesh TensorFlow initialization to avoid scaling before softmax
         self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
@@ -276,7 +277,7 @@ class T5Attention(nn.Module):
         relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
         return relative_buckets
 
-    def compute_bias(self, query_length, key_length, ...):
+    def compute_bias(self, query_length, key_length, device=None):
         """Compute binned relative position bias"""
         if device is None:
             device = self.relative_attention_bias.weight.device
@@ -291,9 +292,8 @@ class T5Attention(nn.Module):
         )
         values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
         values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
-        if ...
-            values = values. ...
-
+        if self.config.is_mimo:
+            values = values.unsqueeze(0)  # shape (1, 1, num_heads, query_length, key_length)
         return values
 
     def forward(
@@ -314,42 +314,41 @@ class T5Attention(nn.Module):
         # Input is (batch_size, seq_length, dim)
         # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
-        if ...
-            batch_size, seq_length = hidden_states.shape[:2]
+        if self.config.is_mimo:
+            batch_size, multivar_dim, seq_length = hidden_states.shape[:3]
         else:
-            batch_size, seq_length = hidden_states.shape[ ...
-            multivar_dim = hidden_states.shape[1]
+            batch_size, seq_length = hidden_states.shape[:2]
         real_seq_length = seq_length
 
         if past_key_value is not None:
             if len(past_key_value) != 2:
-                raise ValueError(
-                    ...
-                    ...
-                    ...
+                raise ValueError(f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states")
+            if self.config.is_mimo:
+                real_seq_length += past_key_value[0].shape[3] if query_length is None else query_length
+            else:
+                real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
 
-        if ...
-            key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]
-        else:
+        if self.config.is_mimo:
             key_length = real_seq_length if key_value_states is None else key_value_states.shape[2]
+        else:
+            key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]
+
 
 
         def shape(states):
             """projection"""
-
-            # states: torch.Size([3, 6, 16, 512]) -> query_states: torch.Size([3, 6, 8 , 16, 64])
-            if len(states.shape) == 3:
-                return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
-            else:
+            if self.config.is_mimo:
                 return states.view(batch_size, multivar_dim, -1, self.n_heads, self.key_value_proj_dim).transpose(2, 3)
+            else:
+                return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
 
 
         def unshape(states):
             """reshape"""
-            if ...
-                return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
-            else:
+            if self.config.is_mimo:
                 return states.transpose(2, 3).contiguous().view(batch_size, multivar_dim, -1, self.inner_dim)
+            else:
+                return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
 
         def project(hidden_states, proj_layer, key_value_states, past_key_value):
             """projects hidden states correctly to key/query states"""
@@ -361,12 +360,14 @@ class T5Attention(nn.Module):
                 # cross-attn
                 # (batch_size, n_heads, seq_length, dim_per_head)
                 hidden_states = shape(proj_layer(key_value_states))
-
             if past_key_value is not None:
                 if key_value_states is None:
                     # self-attn
                     # (batch_size, n_heads, key_length, dim_per_head)
-                    ...
+                    if self.config.is_mimo:
+                        hidden_states = torch.cat([past_key_value, hidden_states], dim=3)
+                    else:
+                        hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
                 elif past_key_value.shape[2] != key_value_states.shape[1]:
                     # checking that the `sequence_length` of the `past_key_value` is the same as
                     # the provided `key_value_states` to support prefix tuning
@@ -393,14 +394,10 @@ class T5Attention(nn.Module):
 
 
         # compute scores
-        if ...
-            scores = torch.matmul(
-                query_states, key_states.transpose(3, 2)
-            )  # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9
+        if self.config.is_mimo:
+            scores = torch.matmul(query_states, key_states.transpose(4, 3))
         else:
-            scores = torch.matmul(
-                query_states, key_states.transpose(4, 3)
-            )
+            scores = torch.matmul(query_states, key_states.transpose(3, 2))  # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9
 
 
 
@@ -408,28 +405,22 @@ class T5Attention(nn.Module):
 
         if position_bias is None:
             if not self.has_relative_attention_bias:
-                ...
-                ...
-                    position_bias = torch.zeros(
-                        (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype
-                    )
+                if self.config.is_mimo:
+                    position_bias = torch.zeros((1,1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype)
                 else:
-                    position_bias = torch.zeros(
-                        (1,multivar_dim, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype
-                    )
+                    position_bias = torch.zeros((1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype)
                 if self.gradient_checkpointing and self.training:
                     position_bias.requires_grad = True
             else:
-                ...
-                if len(hidden_states.shape) == 3:
-                    position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
-                else:
-                    position_bias = self.compute_bias(real_seq_length, key_length,multivar_dim=multivar_dim, device=scores.device)
+                position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
 
             # if key and values are already calculated
             # we want only the last query position bias
             if past_key_value is not None:
-                ...
+                if self.config.is_mimo:
+                    position_bias = position_bias[:, :, :, -hidden_states.size(2) :, :]
+                else:
+                    position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
 
         if mask is not None:
             position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
@@ -443,24 +434,16 @@ class T5Attention(nn.Module):
         else:
             position_bias_masked = position_bias
 
-
         scores += position_bias_masked
-        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
-            scores
-        )  # (batch_size, n_heads, seq_length, key_length)
-        attn_weights = nn.functional.dropout(
-            attn_weights, p=self.dropout, training=self.training
-        )  # (batch_size, n_heads, seq_length, key_length)
+        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)  # (batch_size, n_heads, seq_length, key_length)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)  # (batch_size, n_heads, seq_length, key_length)
 
         # Mask heads if we want to
         if layer_head_mask is not None:
             attn_weights = attn_weights * layer_head_mask
 
 
-        if ...
-            attn_output = unshape(torch.matmul(attn_weights, value_states))  # (batch_size, seq_length, dim)
-        else:
-            attn_output = unshape(torch.matmul(attn_weights, value_states))  # (batch_size, multivar_dim, seq_length, dim)
+        attn_output = unshape(torch.matmul(attn_weights, value_states))  # (batch_size, seq_length, dim)
         attn_output = self.o(attn_output)
 
 
@@ -526,7 +509,6 @@ class T5LayerCrossAttention(nn.Module):
         query_length=None,
         output_attentions=False,
     ):
-
        normed_hidden_states = self.layer_norm(hidden_states)
         attention_output = self.EncDecAttention(
             normed_hidden_states,
@@ -555,6 +537,8 @@ class T5Block(nn.Module):
 
         self.layer.append(T5LayerFF(config))
 
+        self.config = config
+
     def forward(
         self,
         hidden_states,
@@ -613,7 +597,10 @@ class T5Block(nn.Module):
         # the actual query length is unknown for cross attention
         # if using past key value states. Need to inject it here
         if present_key_value_state is not None:
-            ...
+            if self.config.is_mimo:
+                query_length = present_key_value_state[0].shape[3]
+            else:
+                query_length = present_key_value_state[0].shape[2]
         else:
             query_length = None
 
@@ -885,19 +872,14 @@ class T5Stack(T5PreTrainedModel):
             self.embed_tokens = self.embed_tokens.to(self.first_device)
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
+        output_hidden_states = (output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         if input_ids is not None and inputs_embeds is not None:
             err_msg_prefix = "decoder_" if self.is_decoder else ""
-            raise ValueError(
-                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
-            )
+            raise ValueError(f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time")
         elif input_ids is not None:
             input_shape = input_ids.size()
-            # input_ids = input_ids.view(-1, input_shape[-1])
         elif inputs_embeds is not None:
             input_shape = inputs_embeds.size()[:-1]
         else:
@@ -909,13 +891,16 @@ class T5Stack(T5PreTrainedModel):
                 raise ValueError("You have to initialize the model with valid token embeddings")
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if ...
+        if self.config.is_mimo:
             batch_size, multivar_seqs ,seq_length = input_shape
         else:
             batch_size, seq_length = input_shape
 
         # required mask seq length can be calculated via length of past
-        ...
+        if self.config.is_mimo:
+            mask_seq_length = past_key_values[0][0].shape[3] + seq_length if past_key_values is not None else seq_length
+        else:
+            mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length
 
         if use_cache is True:
             if not self.is_decoder:
@@ -926,45 +911,34 @@ class T5Stack(T5PreTrainedModel):
             past_key_values = [None] * len(self.block)
 
         if attention_mask is None:
-            if ...
-                attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
-            else:
-                attention_mask = torch.ones(batch_size, multivar_seqs, mask_seq_length, device=inputs_embeds.device)
+            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
 
 
 
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
-        if ...
-            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+        if self.config.is_mimo:
+            extended_attention_mask = self.get_extended_attention_mask(attention_mask, (input_shape[0], input_shape[2]))
+            extended_attention_mask = extended_attention_mask.unsqueeze(1)
         else:
             extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
-            # permute from [batch_size, 1, multivar_seqs, seq_length] to [batch_size, multivar_seqs, 1, seq_length]
-            extended_attention_mask = extended_attention_mask.permute(0, 2, 1, 3)
-            # Now make it [batch_size, multivar_seqs, 1, 1, seq_length]
-            extended_attention_mask = extended_attention_mask.unsqueeze(3)
 
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if self.is_decoder and encoder_hidden_states is not None:
-            ...
-            if len(encoder_hidden_states.size()) == 3 :
-                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
-            else:
+            if self.config.is_mimo:
                 encoder_batch_size, multivar_dem, encoder_sequence_length, _ = encoder_hidden_states.size()
+            else:
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
 
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
             if encoder_attention_mask is None:
-                encoder_attention_mask = torch.ones(
-                    encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long
-                )
-            if len(input_shape) == 2:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long)
+            if self.config.is_mimo:
                 encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+                encoder_extended_attention_mask = encoder_extended_attention_mask.unsqueeze(1)
             else:
                 encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
-                multivar_dim = extended_attention_mask.shape[1]
-                encoder_extended_attention_mask = encoder_extended_attention_mask.unsqueeze(1)
-                encoder_extended_attention_mask = encoder_extended_attention_mask.permute(0, 3, 1, 2, 4)
 
         else:
             encoder_extended_attention_mask = None
@@ -973,9 +947,7 @@ class T5Stack(T5PreTrainedModel):
 
         if self.gradient_checkpointing and self.training:
             if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
+                logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
                 use_cache = False
 
         # Prepare head mask if needed
@@ -1453,6 +1425,8 @@ class T5MIMOForConditionalGeneration(T5PreTrainedModel):
         >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
         >>> # studies have shown that owning a dog is good for you.
         ```"""
+
+
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -1461,6 +1435,8 @@ class T5MIMOForConditionalGeneration(T5PreTrainedModel):
         if self.config.num_layers == self.config.num_decoder_layers:
             decoder_head_mask = head_mask
 
+
+
         # Encode if needed (training, first prediction pass)
         if encoder_outputs is None:
             # Convert encoder inputs in embeddings if needed
@@ -1500,6 +1476,15 @@ class T5MIMOForConditionalGeneration(T5PreTrainedModel):
             if decoder_attention_mask is not None:
                 decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
 
+        if hidden_states is not None and decoder_input_ids is not None:
+            if len(hidden_states.shape) == 4:
+                batch_size, multivar_seqs, seq_length , model_dim = hidden_states.shape
+                if len(decoder_input_ids.shape) == 2:
+                    decoder_input_ids = decoder_input_ids.unsqueeze(1).repeat(1, multivar_seqs, 1)
+
+
+
+
         # Decode
         decoder_outputs = self.decoder(
             input_ids=decoder_input_ids,
@@ -1518,6 +1503,7 @@ class T5MIMOForConditionalGeneration(T5PreTrainedModel):
 
         sequence_output = decoder_outputs[0]
 
+
         if use_conv:
             sequence_output = self.conv_block(sequence_output)
 
@@ -1548,8 +1534,11 @@ class T5MIMOForConditionalGeneration(T5PreTrainedModel):
         if not return_dict:
             output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
             return ((loss,) + output) if loss is not None else output
+
+
+
 
-        return Seq2SeqLMOutput(
+        seq2seqlmoutput = Seq2SeqLMOutput(
             loss=loss,
             logits=lm_logits,
             past_key_values=decoder_outputs.past_key_values,
@@ -1560,6 +1549,7 @@ class T5MIMOForConditionalGeneration(T5PreTrainedModel):
             encoder_hidden_states=encoder_outputs.hidden_states,
             encoder_attentions=encoder_outputs.attentions,
         )
+        return seq2seqlmoutput
 
     def prepare_inputs_for_generation(
         self,
@@ -1640,6 +1630,7 @@ class T5MIMOEncoderModel(T5PreTrainedModel):
 
     def __init__(self, config: T5MIMOConfig):
        super().__init__(config)
+
         self.shared = nn.Embedding(config.vocab_size, config.d_model)
 
         encoder_config = copy.deepcopy(config)
|