Update modeling/bagel/bagel.py

modeling/bagel/bagel.py  CHANGED  (+9 -22)
@@ -897,13 +897,9 @@ class Bagel(PreTrainedModel):
         the behavior of the original batch generation function, including the handling
         of start tokens and the end-of-sequence token.
         """
+        step = 0
         curr_tokens = packed_start_tokens
-
-        for _ in range(max_length):
-            # The original function would append `curr_tokens` to a list at this point.
-            # Instead, we yield it to the caller, enabling streaming.
-            yield curr_tokens
-
+        while step < max_length:
             packed_text_embedding = self.language_model.model.embed_tokens(curr_tokens)
             query_lens = torch.ones_like(curr_tokens)
             packed_query_indexes = torch.cumsum(key_values_lens, dim=0) + torch.arange(
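Both the removed and the added code keep this method a Python generator: each decoding step hands its token straight to the caller instead of appending it to a list. A rough sketch of how a caller might consume such a generator (the function handle and keyword arguments below are placeholders, not the actual Bagel signature):

import torch

# Sketch: consuming a token-streaming generator like the one in this diff.
# `stream_fn` stands for the modified Bagel method; its name and kwargs are assumed.
def collect_stream(stream_fn, **kwargs):
    tokens = []
    for tok in stream_fn(**kwargs):      # each `tok` is the tensor yielded for one step
        print("new token id:", tok.tolist())
        tokens.append(tok)
    return torch.cat(tokens)             # one tensor of all ids; the original code kept these in a list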
@@ -912,9 +908,6 @@ class Bagel(PreTrainedModel):
                 dtype=key_values_lens.dtype
             )

-            # This block modifies packed_key_value_indexes before the forward pass,
-            # preserving the specific logic for NaViT-style packed inputs.
-            # The typo 'uppacked' is kept to match the original source code.
             uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0))
             for i in range(len(uppacked)):
                 uppacked[i] += i
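For context on the untouched `uppacked` lines above: `packed_key_value_indexes` is a flat index tensor covering every sequence's KV cache, `split(key_values_lens.tolist())` cuts it back into per-sequence chunks, and shifting chunk i by i opens a one-index gap after each earlier chunk (presumably the cache slot for the token appended to that sequence this step). A toy illustration with made-up values, not taken from the model:

import torch

# Two packed sequences with KV lengths 3 and 2; index values are illustrative only.
packed_key_value_indexes = torch.arange(5)      # tensor([0, 1, 2, 3, 4])
key_values_lens = torch.tensor([3, 2])

# Same pattern as the hunk above: split per sequence, then shift chunk i by i.
uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0))
for i in range(len(uppacked)):
    uppacked[i] = uppacked[i] + i   # out-of-place here, to leave the toy input untouched

print(uppacked[0])   # tensor([0, 1, 2])  -- first sequence keeps its indexes
print(uppacked[1])   # tensor([4, 5])     -- second sequence shifted by 1, leaving index 3 free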
@@ -940,20 +933,12 @@ class Bagel(PreTrainedModel):
             packed_query_sequence = output.packed_query_sequence
             pred_logits = self.language_model.lm_head(packed_query_sequence)

-            # Sample the next token
             if do_sample:
                 probs = nn.functional.softmax(pred_logits / temperature, dim=-1)
-                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+                curr_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
             else:
-                next_tokens = torch.argmax(pred_logits, dim=-1)
-
-            # The stop condition is checked on the newly generated token. If it's the
-            # end token, we break the loop. This token will not be yielded.
-            if end_token_id is not None and next_tokens[0] == end_token_id: # only support batch=1
-                break
+                curr_tokens = torch.argmax(pred_logits, dim=-1)

-            # This block updates the state variables for the next iteration. It reads
-            # the already-modified `packed_key_value_indexes` and updates it further.
             uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0))
             for i in range(len(uppacked)):
                 uppacked[i] = torch.cat(
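The two added lines write the sampled token directly into `curr_tokens`: temperature-scaled multinomial sampling when `do_sample` is true, greedy argmax otherwise. A self-contained sketch of that branch on dummy logits (vocabulary size and values are made up):

import torch
import torch.nn as nn

pred_logits = torch.randn(1, 32000)      # dummy logits for one packed query position
do_sample, temperature = True, 0.7

if do_sample:
    # Temperature below 1 sharpens the distribution, above 1 flattens it.
    probs = nn.functional.softmax(pred_logits / temperature, dim=-1)
    curr_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
else:
    # Greedy decoding: always take the highest-scoring token.
    curr_tokens = torch.argmax(pred_logits, dim=-1)

print(curr_tokens.shape)   # torch.Size([1]) -- one new token id, matching batch=1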
@@ -962,10 +947,12 @@ class Bagel(PreTrainedModel):
             packed_key_value_indexes = torch.cat(uppacked, dim=0)
             key_values_lens = key_values_lens + 1
             packed_query_position_ids = packed_query_position_ids + 1
-
-
-            curr_tokens = next_tokens
+            step += 1
+
+            yield curr_tokens  # Yield each token as it's generated

+            if end_token_id is not None and curr_tokens[0] == end_token_id: # only support batch=1
+                break
     # for evaluation
     @torch.no_grad()
     def chat(
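One behavioural detail of the rewritten loop: the freshly sampled token is yielded before the end-of-sequence check, so the `end_token_id` itself reaches the caller (the removed version's comment explicitly promised the opposite). A consumer that does not want to surface it can drop it; a minimal sketch, assuming the generator and `end_token_id` are supplied by the caller:

# Sketch: hide the end-of-sequence token on the consumer side, since the updated
# generator yields it before breaking out of its loop.
def stream_without_eos(token_stream, end_token_id):
    for tok in token_stream:
        if end_token_id is not None and tok[0] == end_token_id:   # batch=1, as in the diff
            break
        yield tok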
The generation loop as it reads after this change (new lines 897-958; "..." marks lines not shown in the diff):

        the behavior of the original batch generation function, including the handling
        of start tokens and the end-of-sequence token.
        """
        step = 0
        curr_tokens = packed_start_tokens
        while step < max_length:
            packed_text_embedding = self.language_model.model.embed_tokens(curr_tokens)
            query_lens = torch.ones_like(curr_tokens)
            packed_query_indexes = torch.cumsum(key_values_lens, dim=0) + torch.arange(
                ...
                dtype=key_values_lens.dtype
            )

            uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0))
            for i in range(len(uppacked)):
                uppacked[i] += i
            ...
            packed_query_sequence = output.packed_query_sequence
            pred_logits = self.language_model.lm_head(packed_query_sequence)

            if do_sample:
                probs = nn.functional.softmax(pred_logits / temperature, dim=-1)
                curr_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
            else:
                curr_tokens = torch.argmax(pred_logits, dim=-1)

            uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0))
            for i in range(len(uppacked)):
                uppacked[i] = torch.cat(
                    ...
            packed_key_value_indexes = torch.cat(uppacked, dim=0)
            key_values_lens = key_values_lens + 1
            packed_query_position_ids = packed_query_position_ids + 1
            step += 1

            yield curr_tokens  # Yield each token as it's generated

            if end_token_id is not None and curr_tokens[0] == end_token_id: # only support batch=1
                break
    # for evaluation
    @torch.no_grad()
    def chat(
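For interactive use, each yielded id can be decoded and flushed as soon as it arrives, which is the point of this change. A rough consumer sketch; the tokenizer object and the way the generator is obtained are assumptions, not part of this diff:

# Sketch: printing text as it streams in. `token_stream` is the generator produced by
# the modified Bagel method; `tokenizer` is a Hugging Face-style tokenizer.
def print_stream(token_stream, tokenizer):
    ids = []
    for tok in token_stream:
        ids.extend(tok.tolist())
        # Re-decoding the running prefix every step keeps subword pieces intact;
        # simple for a demo, though wasteful for very long outputs.
        print("\r" + tokenizer.decode(ids, skip_special_tokens=True), end="", flush=True)
    print()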