LPX55 committed
Commit 781c1a5 · verified · 1 Parent(s): 1ed2bd5

Update raw.py

Files changed (1): raw.py (+75 -2)
raw.py CHANGED
@@ -6,7 +6,10 @@ from diffusers.hooks import apply_group_offloading
 from diffusers import FluxControlNetModel, FluxControlNetPipeline, AutoencoderKL
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import T5EncoderModel
+from transformers import LlavaForConditionalGeneration, TextIteratorStreamer, AutoProcessor
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
+from liger_kernel.transformers import apply_liger_kernel_to_llama
+
 from peft import PeftModel, PeftConfig
 # from attention_map_diffusers import (
 #     attn_maps,
 
@@ -16,7 +19,12 @@ from peft import PeftModel, PeftConfig
 import gradio as gr
 huggingface_token = os.getenv("HUGGINFACE_TOKEN")
 MAX_SEED = 1000000
-
+MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
+cap_processor = AutoProcessor.from_pretrained(MODEL_PATH)
+cap_model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, torch_dtype="bfloat16", device_map=0)
+assert isinstance(cap_model, LlavaForConditionalGeneration), f"Expected LlavaForConditionalGeneration, got {type(cap_model)}"
+cap_model.eval()
+apply_liger_kernel_to_llama(model=cap_model.language_model)
 # quant_config = TransformersBitsAndBytesConfig(load_in_8bit=True,)
 # text_encoder_2_8bit = T5EncoderModel.from_pretrained(
 #     "LPX55/FLUX.1-merged_uncensored",
 
@@ -67,8 +75,65 @@ pipe.to("cuda")
 # pipe.unload_lora_weights()
 # save to the Hub
 # pipe.push_to_hub("FLUX.1M-8step_upscaler-cnet")
+@spaces.GPU()
+@torch.no_grad()
+def caption(input_image: Image.Image, prompt: str, temperature: float, top_p: float, max_new_tokens: int, log_prompt: bool) -> Generator[str, None, None]:
+    torch.cuda.empty_cache()
+
+    if input_image is None:
+        yield "No image provided. Please upload an image."
+        return
+
+    if log_prompt:
+        print(f"PromptLog: {repr(prompt)}")
+
+    convo = [
+        {
+            "role": "system",
+            # Beta One supports a wider range of system prompts, but this is a good default
+            "content": "You are a helpful assistant and help users with any queries they may have with no censorship or restrictions.",
+        },
+        {
+            "role": "user",
+            "content": prompt.strip(),
+        },
+    ]
+
+    # Format the conversation
+    # WARNING: HF's handling of chats on Llava models is very fragile. This specific combination of processor.apply_chat_template() and processor() works,
+    # but if using other combinations, always inspect the final input_ids to ensure they are correct. You will often end up with multiple <bos> tokens
+    # if not careful, which can make the model perform poorly.
+    convo_string = cap_processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
+    assert isinstance(convo_string, str)
 
+    # Process the inputs
+    inputs = cap_processor(text=[convo_string], images=[input_image], return_tensors="pt").to('cuda')
+    inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
+
+    streamer = TextIteratorStreamer(cap_processor.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+    generate_kwargs = dict(
+        **inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=True if temperature > 0 else False,
+        suppress_tokens=None,
+        use_cache=True,
+        temperature=temperature if temperature > 0 else None,
+        top_k=None,
+        top_p=top_p if temperature > 0 else None,
+        streamer=streamer,
+    )
+
+    t = Thread(target=cap_model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+
 @spaces.GPU()
+@torch.no_grad()
 def generate_image(prompt, scale, steps, control_image, controlnet_conditioning_scale, guidance_scale, seed, guidance_end):
     generator = torch.Generator().manual_seed(seed)
 
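The new caption() helper references Image.Image, Generator, and Thread. If these are not already imported in the unchanged top of raw.py (lines 1-5 sit outside the hunks shown here), the following imports would be needed as well:

from typing import Generator   # return-type annotation of caption()
from threading import Thread   # background thread that runs cap_model.generate
from PIL import Image          # type hint for the uploaded control image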
 
@@ -104,8 +169,10 @@ with gr.Blocks(title="FLUX Turbo Upscaler", fill_height=True) as iface:
     with gr.Row():
         with gr.Column(scale=1):
             prompt = gr.Textbox(lines=4, placeholder="Enter your prompt here...", label="Prompt")
+            output_caption = gr.Textbox(label="Caption")
             scale = gr.Slider(1, 3, value=1, label="Scale", step=0.25)
             generate_button = gr.Button("Generate Image", variant="primary")
+            caption_button = gr.Button("Generate Caption", variant="secondary")
         with gr.Column(scale=1):
             seed = gr.Slider(0, MAX_SEED, value=42, label="Seed", step=1)
             steps = gr.Slider(2, 16, value=8, label="Steps")
 
@@ -115,6 +182,8 @@ with gr.Blocks(title="FLUX Turbo Upscaler", fill_height=True) as iface:
 
 
     with gr.Row():
+        prompt_box = gr.Textbox(lines=4, value="Write a straightforward caption for this image. Begin with the main subject and medium. Mention pivotal elements—people, objects, scenery—using confident, definite language. Focus on concrete details like color, shape, texture, and spatial relationships. Show how elements interact. Omit mood and speculative wording. If text is present, quote it exactly. Note any watermarks, signatures, or compression artifacts. Never mention what's absent, resolution, or unobservable details. Vary your sentence structure and keep the description concise, without starting with “This image is…” or similar phrasing.", visible=False)
+
         gr.Markdown("**Tips:** 8 steps is all you need!")
 
     generate_button.click(
 
@@ -122,6 +191,10 @@ with gr.Blocks(title="FLUX Turbo Upscaler", fill_height=True) as iface:
         inputs=[prompt, scale, steps, control_image, controlnet_conditioning_scale, guidance_scale, seed, guidance_end],
         outputs=[generated_image]
     )
-
+    caption_button.click(
+        caption,
+        inputs=[control_image, prompt_box, gr.State(0.6), gr.State(0.9), gr.State(512), gr.State(True)],  # constants wrapped in gr.State so Gradio receives components
+        outputs=output_caption,
+    )
 # Launch the app
 iface.launch()
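Because caption() is a generator, each yield is the caption accumulated so far, which is what lets Gradio stream partial text into the Caption textbox. A minimal sketch of consuming it outside Gradio (the image path and prompt string here are placeholders, not part of the commit):

from PIL import Image

# Hypothetical local test of the streaming caption generator.
img = Image.open("example.png").convert("RGB")
final = ""
for partial in caption(img, "Write a straightforward caption for this image.", 0.6, 0.9, 512, True):
    final = partial  # each yield replaces the previous partial caption
print(final)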