#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2025/4/29 20:03
# @Author  : hukangzhe
# @File    : test_qw.py
# @Description : Test whether both output modes (full and stream) work correctly for two models (qwen1.5, qwen3)
import queue
import logging
import threading
import torch
from typing import Tuple, Generator
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

class ThinkStreamer(TextStreamer):
    """Splits Qwen3 output into a thinking phase (before </think>) and an
    answer phase (after it), exposed as (state, content) tuples."""

    def __init__(self, tokenizer: AutoTokenizer, skip_prompt: bool = True, **decode_kwargs):
        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
        self.is_thinking = True
        # "</think>" is a single special token in Qwen3's vocabulary.
        self.think_end_token_id = self.tokenizer.encode("</think>", add_special_tokens=False)[0]
        self.output_queue = queue.Queue()

    def on_finalized_text(self, text: str, stream_end: bool = False):
        self.output_queue.put(text)
        if stream_end:
            self.output_queue.put(None)  # sentinel: signal end of stream

    def __iter__(self):
        return self

    def __next__(self):
        value = self.output_queue.get()
        if value is None:
            raise StopIteration()
        return value

    def generate_output(self) -> Generator[Tuple[str, str], None, None]:
        full_decode_text = ""
        already_yielded_len = 0
        for text_chunk in self:
            if not self.is_thinking:
                yield "answer", text_chunk
                continue

            full_decode_text += text_chunk
            tokens = self.tokenizer.encode(full_decode_text, add_special_tokens=False)

            if self.think_end_token_id in tokens:
                split_point = tokens.index(self.think_end_token_id)
                thinking_text = self.tokenizer.decode(tokens[:split_point])
                # Skip the </think> token itself so the tag does not leak
                # into the answer text.
                answer_text = self.tokenizer.decode(tokens[split_point + 1:])

                remaining_thinking = thinking_text[already_yielded_len:]
                if remaining_thinking:
                    yield "thinking", remaining_thinking

                if answer_text:
                    yield "answer", answer_text

                # Every later chunk belongs to the answer phase.
                self.is_thinking = False
            else:
                yield "thinking", text_chunk
                already_yielded_len += len(text_chunk)
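
    # Worked micro-example of the split above (hypothetical token ids, for
    # illustration only): if the accumulated tokens are
    #     [1212, 374, 151668, 791, 4320]   # "This is</think>The answer"
    # and 151668 is the </think> id, generate_output() yields
    # ("thinking", "This is") and then ("answer", "The answer"); every
    # subsequent chunk is yielded directly as ("answer", ...).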



class LLMInterface:
    def __init__(self, model_name: str = "Qwen/Qwen3-0.6B"):
        logging.info(f"Initializing generator {model_name}")
        self.generator_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.generator_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto")
        # With device_map="auto" the model may land on GPU; keep inputs on
        # the same device as the model rather than hard-coding cuda:0.
        self.device = self.generator_model.device

    def generate_answer(self, query: str, context_str: str) -> str:
        messages = [
            {"role": "system", "content": "You are a question-answering assistant. Answer based on the provided context and do not make up information."},
            {"role": "user", "content": f"Context:\n---\n{context_str}\n---\nBased on the context above, answer this question: {query}"}
        ]
        prompt = self.generator_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = self.generator_tokenizer(prompt, return_tensors="pt").to(self.device)
        output = self.generator_model.generate(
            **inputs,
            max_new_tokens=256,
            num_return_sequences=1,
            eos_token_id=self.generator_tokenizer.eos_token_id,
        )
        generated_ids = output[0][inputs["input_ids"].shape[1]:]
        answer = self.generator_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        return answer

    def generate_answer_stream(self, query: str, context_str: str) -> Generator[Tuple[str, str], None, None]:
        """Generates an answer as a stream of (state, content) tuples."""
        messages = [
            {"role": "system",
             "content": "You are a helpful assistant. Please answer the question based on the provided context. First, think through the process in <think> tags, then provide the final answer."},
            {"role": "user",
             "content": f"Context:\n---\n{context_str}\n---\nBased on the context above, please answer the question: {query}"}
        ]

        # Use the template that enables thinking for Qwen models
        prompt = self.generator_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=True
        )
        model_inputs = self.generator_tokenizer([prompt], return_tensors="pt").to(self.device)

        streamer = ThinkStreamer(self.generator_tokenizer, skip_prompt=True)

        generation_kwargs = dict(
            **model_inputs,
            max_new_tokens=512,
            streamer=streamer
        )

        # Run generation in a background thread; the streamer's queue feeds
        # the generator below until the end-of-stream sentinel arrives.
        thread = threading.Thread(target=self.generator_model.generate, kwargs=generation_kwargs)
        thread.start()

        yield from streamer.generate_output()
        thread.join()
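

# Illustrative helper (not part of the original interface): consumes the
# (state, content) stream from generate_answer_stream and prints the
# thinking trace and the final answer with phase labels.
def print_stream(chunks: Generator[Tuple[str, str], None, None]) -> None:
    current_state = None
    for state, content in chunks:
        if state != current_state:
            # Print a label whenever the stream switches phase.
            print(f"\n[{state}] ", end="")
            current_state = state
        print(content, end="", flush=True)
    print()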


if __name__ == "__main__":
    qwen = LLMInterface("Qwen/Qwen3-0.6B")
    answer = qwen.generate_answer(
        "Who founded Confucianism?",
        "Traditional Chinese philosophy centers on three main schools: Confucianism, Daoism, and Legalism. "
        "Confucianism was founded by Confucius. It stresses benevolence, righteousness, propriety, wisdom, "
        "and trustworthiness, advocates cultivating oneself, regulating the family, governing the state, and "
        "bringing peace to the world, and has profoundly shaped Chinese society. Core maxims such as "
        "'Do not impose on others what you do not wish for yourself' remain universally relevant today. "
        "Daoism, represented by Laozi and Zhuangzi, holds that the Dao follows nature, pursues harmony between "
        "humans and nature, and emphasizes governing through non-action and quietude; it has deeply influenced "
        "Chinese aesthetics, artistic creation, and practices of health cultivation. "
        "Legalism, synthesized by Han Feizi, advocates governing the state by law and stresses the ruler's "
        "authority and the supremacy of legal codes; although historically used to strengthen centralized "
        "power, it also provided an important theoretical basis for building a sound legal system.")
    print(answer)
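
    # Illustrative addition (not in the original script): also exercise the
    # streaming path, since the header says both output modes are under test.
    print_stream(qwen.generate_answer_stream(
        "Who founded Confucianism?",
        "Confucianism was founded by Confucius and stresses benevolence, "
        "righteousness, propriety, wisdom, and trustworthiness.",
    ))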