Transformers 源码解析（十）

最新推荐文章于 2025-07-24 14:11:38 发布

绝不原创的飞龙

最新推荐文章于 2025-07-24 14:11:38 发布

阅读量336

点赞数 2

CC 4.0 BY-SA版权

分类专栏： transformers 文章标签： pytorch 深度学习人工智能

License CC BY-NC-SA 4.0 / 自豪地采用谷歌翻译

本文链接：https://blog.csdn.net/wizardforcel/article/details/140123201

transformers 专栏收录该内容

310 篇文章

订阅专栏

`.\generation\stopping_criteria.py`

# 导入时间模块，用于处理时间相关功能
import time
# 导入警告模块，用于发出警告信息
import warnings
# 导入抽象基类模块，用于定义抽象类
from abc import ABC
# 导入深拷贝函数，用于创建对象的深层副本
from copy import deepcopy
# 导入类型提示模块，用于指定参数和返回值的类型
from typing import Optional

# 导入PyTorch库
import torch

# 从本地utils模块中导入指定函数和类
from ..utils import add_start_docstrings, logging

# 从logging模块中获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 停止条件的文档字符串，使用原始字符串表示（r"..."），包含参数和返回值的描述
STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
            Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
            or scores for each vocabulary token after SoftMax. If this stopping criteria depends on the `scores` input,
            make sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`.
        kwargs (`Dict[str, Any]`, *optional*):
            Additional stopping criteria specific kwargs.

    Return:
        `torch.BoolTensor`. (`torch.BoolTensor` of shape `(batch_size, 1)`), where `True` indicates we stop generation
            for a particular row, `True` indicates we should continue.

"""


class StoppingCriteria(ABC):
    """Abstract base class for all stopping criteria that can be applied during generation.

    If your stopping criteria depends on the `scores` input, make sure you pass `return_dict_in_generate=True,
    output_scores=True` to `generate`.
    """

    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
        # 抽象方法，子类需实现该方法来定义停止生成的具体逻辑
        raise NotImplementedError("StoppingCriteria needs to be subclassed")


class MaxLengthCriteria(StoppingCriteria):
    """
    This class can be used to stop generation whenever the full generated number of tokens exceeds `max_length`. Keep
    in mind for decoder-only type of transformers, this will include the initial prompted tokens.

    Args:
        max_length (`int`):
            The maximum length that the output sequence can have in number of tokens.
        max_position_embeddings (`int`, *optional*):
            The maximum model length, as defined by the model's `config.max_position_embeddings` attribute.
    """

    def __init__(self, max_length: int, max_position_embeddings: Optional[int] = None):
        # 初始化最大长度和最大位置嵌入
        self.max_length = max_length
        self.max_position_embeddings = max_position_embeddings

    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
    # 定义一个调用函数，用于生成文本序列的逻辑
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
        # 获取当前输入序列的长度
        cur_len = input_ids.shape[-1]
        # 检查当前序列长度是否已经达到或超过最大生成长度
        is_done = cur_len >= self.max_length
        # 如果模型限制了最大位置嵌入数量且当前长度未达到生成上限，并且当前长度已经超过最大位置嵌入数量，则发出警告
        if self.max_position_embeddings is not None and not is_done and cur_len >= self.max_position_embeddings:
            logger.warning_once(
                "This is a friendly reminder - the current text generation call will exceed the model's predefined "
                f"maximum length ({self.max_position_embeddings}). Depending on the model, you may observe "
                "exceptions, performance degradation, or nothing at all."
            )
        # 返回一个布尔张量，表示每个输入序列是否已完成生成
        return torch.full((input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool)
# 继承自 `StoppingCriteria` 类的子类 `MaxNewTokensCriteria`，用于在生成的标记数超过 `max_new_tokens` 时停止生成。
class MaxNewTokensCriteria(StoppingCriteria):
    """
    This class can be used to stop generation whenever the generated number of tokens exceeds `max_new_tokens`. Keep in
    mind for decoder-only type of transformers, this will **not** include the initial prompted tokens. This is very
    close to `MaxLengthCriteria` but ignores the number of initial tokens.

    Args:
        start_length (`int`):
            The number of initial tokens.
        max_new_tokens (`int`):
            The maximum number of tokens to generate.
    """

    # 初始化方法，发出警告信息表明该类已被弃用，建议使用 `MaxLengthCriteria` 替代
    def __init__(self, start_length: int, max_new_tokens: int):
        warnings.warn(
            "The class `MaxNewTokensCriteria` is deprecated. "
            f"Please use `MaxLengthCriteria(max_length={start_length + max_new_tokens})` "
            "with `max_length = start_length + max_new_tokens` instead.",
            FutureWarning,
        )
        # 初始化属性，记录初始标记数和允许生成的最大标记数
        self.start_length = start_length
        self.max_new_tokens = max_new_tokens
        self.max_length = start_length + max_new_tokens

    # 调用对象时的方法，检查是否达到生成的最大标记数
    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
        # 判断输入标记的长度是否大于等于设定的最大长度
        is_done = input_ids.shape[-1] >= self.max_length
        return torch.full((input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool)


# 继承自 `StoppingCriteria` 类的子类 `MaxTimeCriteria`，用于在生成时间超过 `max_time` 秒时停止生成。
class MaxTimeCriteria(StoppingCriteria):
    """
    This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the
    time will start being counted when you initialize this function. You can override this by passing an
    `initial_time`.

    Args:
        max_time (`float`):
            The maximum allowed time in seconds for the generation.
        initial_time (`float`, *optional*, defaults to `time.time()`):
            The start of the generation allowed time.
    """

    # 初始化方法，记录最大允许生成时间和开始计时的时间戳（默认为当前时间）
    def __init__(self, max_time: float, initial_timestamp: Optional[float] = None):
        self.max_time = max_time
        self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp

    # 调用对象时的方法，检查是否超过了允许的生成时间
    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
        # 计算当前时间与初始时间戳之间的差值，判断是否超过了最大允许时间
        is_done = time.time() - self.initial_timestamp > self.max_time
        return torch.full((input_ids.shape[0],), is_done, device=input_ids.device, dtype=torch.bool)


# 继承自列表的子类 `StoppingCriteriaList`，用于存储多个停止生成的条件，并在任何一个条件满足时停止生成。
class StoppingCriteriaList(list):
    @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.BoolTensor:
        # 初始化一个全为 False 的 torch.BoolTensor，表示生成未完成
        is_done = torch.full((input_ids.shape[0],), False, device=input_ids.device)
        # 遍历存储的所有停止条件，如果任何一个条件返回 True，则更新 is_done 为 True
        for criteria in self:
            is_done = is_done | criteria(input_ids, scores, **kwargs)
        return is_done
    # 定义一个方法 `max_length`，返回类型是可选的整数（可能为None）
    def max_length(self) -> Optional[int]:
        # 遍历当前对象实例中的每一个停止条件
        for stopping_criterium in self:
            # 如果当前停止条件是 `MaxLengthCriteria` 类型的实例
            if isinstance(stopping_criterium, MaxLengthCriteria):
                # 返回 `MaxLengthCriteria` 实例中定义的最大长度
                return stopping_criterium.max_length
            # 如果当前停止条件是 `MaxNewTokensCriteria` 类型的实例
            elif isinstance(stopping_criterium, MaxNewTokensCriteria):
                # 返回 `MaxNewTokensCriteria` 实例中定义的最大长度
                return stopping_criterium.max_length
        # 如果没有找到符合条件的停止条件，返回 None
        return None
# 定义一个函数，用于验证停止条件列表是否符合规范，并返回更新后的停止条件列表对象
def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, max_length: int) -> StoppingCriteriaList:
    # 获取停止条件列表中的最大长度
    stopping_max_length = stopping_criteria.max_length
    # 深度复制原始的停止条件列表对象，以免修改原始数据
    new_stopping_criteria = deepcopy(stopping_criteria)
    
    # 如果停止条件列表中的最大长度存在，并且与传入的 max_length 参数不相等
    if stopping_max_length is not None and stopping_max_length != max_length:
        # 发出警告，指出设置的停止条件最大长度与传入参数的最大长度不一致
        warnings.warn("You set different `max_length` for stopping criteria and `max_length` parameter", UserWarning)
    # 如果停止条件列表中的最大长度不存在
    elif stopping_max_length is None:
        # 向新的停止条件列表中添加一个新的最大长度停止条件对象
        new_stopping_criteria.append(MaxLengthCriteria(max_length=max_length))
    
    # 返回更新后的停止条件列表对象
    return new_stopping_criteria

`.\generation\streamers.py`

# 从队列模块导入队列类
from queue import Queue
# 导入类型检查工具，用于类型提示
from typing import TYPE_CHECKING, Optional

# 如果 TYPE_CHECKING 为真，则从 ..models.auto 模块导入 AutoTokenizer 类
if TYPE_CHECKING:
    from ..models.auto import AutoTokenizer

# 基础流生成器的基类，用于所有生成器流类的继承
class BaseStreamer:
    """
    Base class from which `.generate()` streamers should inherit.
    """

    def put(self, value):
        """Function that is called by `.generate()` to push new tokens"""
        # 抛出未实现错误，子类需要实现该方法
        raise NotImplementedError()

    def end(self):
        """Function that is called by `.generate()` to signal the end of generation"""
        # 抛出未实现错误，子类需要实现该方法
        raise NotImplementedError()


class TextStreamer(BaseStreamer):
    """
    Simple text streamer that prints the token(s) to stdout as soon as entire words are formed.

    <Tip warning={true}>

    The API for the streamer classes is still under development and may change in the future.

    </Tip>

    Parameters:
        tokenizer (`AutoTokenizer`):
            The tokenized used to decode the tokens.
        skip_prompt (`bool`, *optional*, defaults to `False`):
            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
        decode_kwargs (`dict`, *optional*):
            Additional keyword arguments to pass to the tokenizer's `decode` method.

    Examples:

        ```
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

        >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
        >>> streamer = TextStreamer(tok)

        >>> # Despite returning the usual output, the streamer will also print the generated text to stdout.
        >>> _ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
        An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
        ```
    """

    def __init__(self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, **decode_kwargs):
        # 初始化方法，接收一个自动标记器实例和可选参数
        self.tokenizer = tokenizer
        self.skip_prompt = skip_prompt
        self.decode_kwargs = decode_kwargs

        # 用于流处理的变量
        self.token_cache = []  # 初始化空的标记缓存列表
        self.print_len = 0  # 初始化打印长度为 0
        self.next_tokens_are_prompt = True  # 初始化下一个标记为提示状态
    def put(self, value):
        """
        Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
        """
        # 检查输入值的维度和批处理大小是否符合要求
        if len(value.shape) > 1 and value.shape[0] > 1:
            raise ValueError("TextStreamer only supports batch size 1")
        elif len(value.shape) > 1:
            value = value[0]

        # 如果设置跳过提示且下一个标记是提示，则跳过处理
        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return

        # 将新标记添加到缓存并进行解码
        self.token_cache.extend(value.tolist())
        text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)

        # 如果文本以换行符结尾，则刷新缓存
        if text.endswith("\n"):
            printable_text = text[self.print_len :]
            self.token_cache = []
            self.print_len = 0
        # 如果最后一个标记是CJK字符，则打印这些字符
        elif len(text) > 0 and self._is_chinese_char(ord(text[-1])):
            printable_text = text[self.print_len :]
            self.print_len += len(printable_text)
        # 否则，打印直到最后一个空格字符（简单的启发式方法，避免打印不完整的单词）
        else:
            printable_text = text[self.print_len : text.rfind(" ") + 1]
            self.print_len += len(printable_text)

        # 调用处理最终文本的回调函数
        self.on_finalized_text(printable_text)

    def end(self):
        """Flushes any remaining cache and prints a newline to stdout."""
        # 如果缓存中还有剩余内容，则刷新缓存
        if len(self.token_cache) > 0:
            text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
            printable_text = text[self.print_len :]
            self.token_cache = []
            self.print_len = 0
        else:
            printable_text = ""

        # 设置下一个标记为提示
        self.next_tokens_are_prompt = True
        # 调用处理最终文本的回调函数，并标志流结束
        self.on_finalized_text(printable_text, stream_end=True)

    def on_finalized_text(self, text: str, stream_end: bool = False):
        """Prints the new text to stdout. If the stream is ending, also prints a newline."""
        # 将新文本输出到标准输出，如果流结束则打印换行符
        print(text, flush=True, end="" if not stream_end else None)
    # 检查给定的代码点（CP）是否是CJK字符的代码点
    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # 这里定义的“中文字符”是指CJK统一表意字符（Unicode块）中的任何字符：
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # 需要注意，尽管名称中包含CJK统一表意字符，但并非所有日文和韩文字符都包含在内。
        # 现代韩文Hangul字母使用了不同的Unicode块，日文的平假名和片假名也是如此。
        # 这些字母用于书写以空格分隔的词语，因此不被特别对待，会像其他语言一样处理。
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)            # CJK统一表意字符（4E00-9FFF）
            or (cp >= 0x3400 and cp <= 0x4DBF)        # CJK统一表意字符扩展A（3400-4DBF）
            or (cp >= 0x20000 and cp <= 0x2A6DF)      # CJK统一表意字符扩展B（20000-2A6DF）
            or (cp >= 0x2A700 and cp <= 0x2B73F)      # CJK统一表意字符扩展C（2A700-2B73F）
            or (cp >= 0x2B740 and cp <= 0x2B81F)      # CJK统一表意字符扩展D（2B740-2B81F）
            or (cp >= 0x2B820 and cp <= 0x2CEAF)      # CJK兼容扩展（2B820-2CEAF）
            or (cp >= 0xF900 and cp <= 0xFAFF)        # CJK兼容象形文字（F900-FAFF）
            or (cp >= 0x2F800 and cp <= 0x2FA1F)      # CJK兼容表意文字补充（2F800-2FA1F）
        ):  # 如果CP位于任何上述范围内，则返回True，表示是中文字符
            return True

        # 如果不在以上范围内，则返回False，表示不是中文字符
        return False
class TextIteratorStreamer(TextStreamer):
    """
    Streamer that stores print-ready text in a queue, to be used by a downstream application as an iterator. This is
    useful for applications that benefit from accessing the generated text in a non-blocking way (e.g. in an interactive
    Gradio demo).

    <Tip warning={true}>

    The API for the streamer classes is still under development and may change in the future.

    </Tip>

    Parameters:
        tokenizer (`AutoTokenizer`):
            The tokenizer used to decode the tokens.
        skip_prompt (`bool`, *optional*, defaults to `False`):
            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
        timeout (`float`, *optional*):
            The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
            in `.generate()`, when it is called in a separate thread.
        decode_kwargs (`dict`, *optional*):
            Additional keyword arguments to pass to the tokenizer's `decode` method.

    Examples:

        ```
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
        >>> from threading import Thread

        >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
        >>> streamer = TextIteratorStreamer(tok)

        >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
        >>> generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
        >>> thread = Thread(target=model.generate, kwargs=generation_kwargs)
        >>> thread.start()
        >>> generated_text = ""
        >>> for new_text in streamer:
        ...     generated_text += new_text
        >>> generated_text
        'An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,'
        ```
    """

    def __init__(
        self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: Optional[float] = None, **decode_kwargs
    ):
        # 调用父类的初始化方法，传递 tokenizer 和 decode_kwargs
        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
        # 创建一个队列来存储生成的文本
        self.text_queue = Queue()
        # 初始化停止信号为 None
        self.stop_signal = None
        # 设置超时时间
        self.timeout = timeout

    def on_finalized_text(self, text: str, stream_end: bool = False):
        """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
        # 将新生成的文本放入队列中，如果流结束，则也放入停止信号
        self.text_queue.put(text, timeout=self.timeout)
        if stream_end:
            self.text_queue.put(self.stop_signal, timeout=self.timeout)

    def __iter__(self):
        # 返回迭代器自身
        return self

    def __next__(self):
        # 从队列中获取值，如果是停止信号则抛出 StopIteration 异常，否则返回值
        value = self.text_queue.get(timeout=self.timeout)
        if value == self.stop_signal:
            raise StopIteration()
        else:
            return value

`.\generation\tf_logits_process.py`

# 导入模块inspect用于检查对象，并从typing导入List和Tuple
import inspect
from typing import List, Tuple

# 导入NumPy和TensorFlow库
import numpy as np
import tensorflow as tf

# 从上级目录的tf_utils模块导入stable_softmax函数
from ..tf_utils import stable_softmax
# 从上级目录的utils模块导入add_start_docstrings函数
from ..utils import add_start_docstrings
# 从utils.logging模块导入get_logger函数
from ..utils.logging import get_logger

# 使用get_logger函数获取当前模块的日志记录器
logger = get_logger(__name__)

# TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING是一个原始字符串，描述了TFLogitsProcessor和TFLogitsWarper类中__call__方法的参数和返回值
TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        scores (`tf.Tensor` of shape `(batch_size, config.vocab_size)`):
            Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
            search or log softmax for each vocabulary token when using beam search.
        cur_len (`int`):
            The current length of valid input sequence tokens. In the TF implementation, the input_ids' sequence length
            is the maximum length generate can produce, and we need to know which of its tokens are valid.
        kwargs (`Dict[str, Any]`, *optional*):
            Additional logits processor specific kwargs.

    Return:
        `tf.Tensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
"""

# TFLogitsProcessor类定义了一个抽象基类，用于在生成过程中应用的所有logit处理器
class TFLogitsProcessor:
    """Abstract base class for all logit processors that can be applied during generation."""

    # 使用add_start_docstrings装饰器，添加了TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING作为文档字符串
    @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        """TF method for processing logits."""
        # 抛出未实现错误，提示该类是抽象类，只能由继承它的类调用
        raise NotImplementedError(
            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
        )

# TFLogitsWarper类定义了一个抽象基类，用于在生成过程中使用多项式抽样时应用的所有logit包装器
class TFLogitsWarper:
    """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling."""

    # 使用add_start_docstrings装饰器，添加了TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING作为文档字符串
    @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        """TF method for warping logits."""
        # 抛出未实现错误，提示该类是抽象类，只能由继承它的类调用
        raise NotImplementedError(
            f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
        )
# 定义一个继承自列表的类 `TFLogitsProcessorList`，用于存储一组 `TFLogitsProcessor` 对象，以便后续处理输入张量 `scores`。
# 该类添加了特定的 `__call__` 方法，用于对每个 `TFLogitsProcessor` 对象应用处理。
class TFLogitsProcessorList(list):
    
    # 使用装饰器 `add_start_docstrings` 应用输入参数的文档字符串 `TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING`
    @add_start_docstrings(TF_LOGITS_PROCESSOR_INPUTS_DOCSTRING)
    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int, **kwargs) -> tf.Tensor:
        # 遍历列表中的每个处理器 `processor`
        for processor in self:
            # 检索处理器 `processor` 的调用方法的参数列表
            function_args = inspect.signature(processor.__call__).parameters
            # 如果参数个数超过 3
            if len(function_args) > 3:
                # 检查是否传递了所有必需的参数到 `processor` 的调用方法
                if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
                    raise ValueError(
                        f"Make sure that all the required parameters: {list(function_args.keys())} for "
                        f"{processor.__class__} are passed to the logits processor."
                    )
                # 调用 `processor` 的方法，并更新 `scores`
                scores = processor(input_ids, scores, cur_len, **kwargs)
            else:
                # 否则，调用 `processor` 的方法，并更新 `scores`
                scores = processor(input_ids, scores, cur_len)
        # 返回处理后的 `scores`
        return scores


# 定义一个继承自 `TFLogitsWarper` 的类 `TFTemperatureLogitsWarper`
# 用于温度调节（指数缩放输出概率分布）的 `TFLogitsWarper`
class TFTemperatureLogitsWarper(TFLogitsWarper):
    
    # 初始化方法，接受一个 `temperature` 参数作为温度值
    def __init__(self, temperature: float):
        # 如果 `temperature` 不是 `float` 类型或者不是严格正数，则抛出 `ValueError`
        if not isinstance(temperature, float) or not (temperature > 0):
            raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}")
        
        # 将 `temperature` 赋值给实例变量 `self.temperature`
        self.temperature = temperature
    
    # 调用方法，接受 `input_ids`、`scores`、`cur_len` 参数，返回处理后的 `scores`
    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 将 `scores` 按 `self.temperature` 进行缩放处理
        scores = scores / self.temperature
        # 返回处理后的 `scores`
        return scores


# 定义一个继承自 `TFLogitsWarper` 的类 `TFTopKLogitsWarper`
# 用于进行 top-k 操作的 `TFLogitsWarper`，即保留概率最高的 `top_k` 个元素
class TFTopKLogitsWarper(TFLogitsWarper):
    
    # 初始化方法，接受 `top_k`、`filter_value`（可选，默认为 `-inf`）、`min_tokens_to_keep`（可选，默认为 `1`）三个参数
    def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
        # 如果 `top_k` 不是正整数，则抛出 `ValueError`
        if not isinstance(top_k, int) or top_k <= 0:
            raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
        
        # 将 `top_k` 和 `min_tokens_to_keep` 中的最大值赋值给实例变量 `self.top_k`
        self.top_k = max(top_k, min_tokens_to_keep)
        # 将 `filter_value` 赋值给实例变量 `self.filter_value`
        self.filter_value = filter_value
    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 计算实际需要考虑的top_k值，确保不超过scores张量的最后一个维度的大小
        top_k = min(self.top_k, scores.shape[-1])  # Safety check
        
        # 创建一个布尔遮罩，标记所有概率小于top-k中最后一个概率的token
        indices_to_remove = scores < tf.math.top_k(scores, k=top_k)[0][..., -1:]
        
        # 根据遮罩，将需要移除的token对应的分数替换为过滤值self.filter_value
        next_scores = tf.where(indices_to_remove, self.filter_value, scores)
        
        # 返回更新后的分数张量
        return next_scores
    # `TFLogitsWarper`的子类，执行top-p截断，即限制保留加起来小于等于prob_cut_off的前几个最有可能的token。

    Args:
        top_p (`float`):
            如果设置为小于1的值，则只保留概率相加达到`top_p`或更高的最有可能的token用于生成。
        filter_value (`float`, *optional*, 默认为负无穷):
            所有被过滤的值将被设置为这个浮点数值。
        min_tokens_to_keep (`int`, *optional*, 默认为1):
            不能被过滤掉的最小token数目。
    """

    def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
        # 检查top_p是否为浮点数且在0到1之间
        if not isinstance(top_p, float) or (top_p < 0 or top_p > 1.0):
            raise ValueError(f"`top_p`必须是一个大于0且小于1的浮点数，当前值为{top_p}")
        # 检查min_tokens_to_keep是否为正整数
        if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
            raise ValueError(f"`min_tokens_to_keep`必须是一个正整数，当前值为{min_tokens_to_keep}")

        # 初始化实例变量
        self.top_p = top_p
        self.filter_value = filter_value
        self.min_tokens_to_keep = min_tokens_to_keep

    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 获取前k个最高分数和对应的索引
        topk_scores, topk_indices = tf.math.top_k(scores, scores.shape[-1])

        # 创建与scores相同形状的填充值为filter_value的张量
        mask_scores = tf.fill(scores.shape, self.filter_value)
        # 计算topk_scores的稳定softmax，并累积概率
        cumulative_probs = tf.math.cumsum(stable_softmax(topk_scores, axis=-1), axis=-1)
        # 创建一个布尔掩码，标记累积概率小于top_p的位置
        score_mask = cumulative_probs < self.top_p

        # 将第一个false替换为true，确保包含大于top_p的token
        score_mask = tf.concat((tf.ones([score_mask.shape[0], 1], dtype=tf.bool), score_mask[:, :-1]), axis=-1)

        # 确保保留至少min_tokens_to_keep个token
        score_mask = tf.concat(
            (
                tf.ones([score_mask.shape[0], self.min_tokens_to_keep], dtype=tf.bool),
                score_mask[:, self.min_tokens_to_keep:],
            ),
            axis=-1,
        )

        # 根据掩码将不符合条件的值设为filter_value
        topk_next_scores = tf.where(score_mask, topk_scores, mask_scores)

        # 恢复topk排序的顺序：将原始索引位置重新分散到张量中
        scatter_rows = tf.tile(tf.expand_dims(tf.range(topk_indices.shape[0]), axis=-1), [1, topk_indices.shape[-1]])
        scatter_indices = tf.stack((scatter_rows, topk_indices), axis=-1)
        next_scores = tf.scatter_nd(scatter_indices, topk_next_scores, shape=topk_next_scores.shape)

        return next_scores
    # 定义一个 TFLogitsProcessor 类，用于处理 logits（预测得分），实现通过设置 EOS 概率为 0 来强制最小长度。

    Args:
        min_length (`int`):
            最小长度，低于此长度时，`eos_token_id` 的得分被设置为 `-float("Inf")`。
        eos_token_id (`int`):
            *end-of-sequence*（EOS）标记的 id。
    """

    def __init__(self, min_length: int, eos_token_id: int):
        # 检查并设置 `min_length` 参数，必须为正整数
        if not isinstance(min_length, int) or min_length < 0:
            raise ValueError(f"`min_length` 必须是正整数，但其值为 {min_length}")

        # 检查并设置 `eos_token_id` 参数，必须为正整数
        if not isinstance(eos_token_id, int) or eos_token_id < 0:
            raise ValueError(f"`eos_token_id` 必须是正整数，但其值为 {eos_token_id}")

        # 初始化对象的属性
        self.min_length = min_length
        self.eos_token_id = eos_token_id

    def _apply_eos_token_mask(self, scores: tf.Tensor) -> tf.Tensor:
        # 创建一个掩码，标记出 scores 中等于 eos_token_id 的位置
        eos_token_id_mask = tf.range(scores.shape[-1]) == self.eos_token_id
        # 使用 tf.where 函数将 eos_token_id 的位置对应的 scores 设置为 -inf
        scores = tf.where(eos_token_id_mask, float("-inf"), scores)
        return scores

    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 如果当前长度 cur_len 小于 min_length，则应用 eos token 掩码
        scores = tf.cond(
            tf.less(cur_len, self.min_length),
            lambda: self._apply_eos_token_mask(scores),
            lambda: tf.identity(scores),
        )
        return scores
class TFRepetitionPenaltyLogitsProcessor(TFLogitsProcessor):
    r"""
    [`TFLogitsProcessor`] enforcing an exponential penalty on repeated sequences.

    Args:
        repetition_penalty (`float`):
            The parameter for repetition penalty. 1.0 means no penalty. See [this
            paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    """

    def __init__(self, penalty: float):
        # 检查 penalty 参数是否为正浮点数，若不是则抛出 ValueError 异常
        if not isinstance(penalty, float) or not (penalty > 0):
            raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")

        self.penalty = penalty

    def _create_score_penalties(self, input_ids: tf.Tensor, logits: tf.Tensor) -> tf.Tensor:
        # 我们希望在 `input_ids` 的位置上填充惩罚值。由于 XLA 不能处理运行时未知的形状，
        # 不能使用 `tf.unique`。因此，当给定行中的同一标记出现多次时，可能会有冗余更新。

        # 收集要应用的惩罚值
        logit_penalties = tf.gather(logits, input_ids, axis=1, batch_dims=1)
        logit_penalties = tf.where(logit_penalties > 0, 1 / self.penalty, logit_penalties)
        logit_penalties = tf.where(logit_penalties < 0, self.penalty, logit_penalties)

        # 分散惩罚值
        token_penalties = tf.ones(logits.shape)
        batch_size = input_ids.shape[0]
        seq_len = tf.shape(input_ids)[1]  # 序列长度具有动态大小，因此使用动态形状
        indexable_prev_input_ids = tf.concat(
            (
                tf.expand_dims(tf.repeat(tf.range(batch_size), seq_len), axis=-1),
                tf.expand_dims(tf.reshape(input_ids, [-1]), axis=-1),
            ),
            axis=1,
        )
        token_penalties = tf.tensor_scatter_nd_update(
            token_penalties, indices=indexable_prev_input_ids, updates=tf.reshape(logit_penalties, [-1])
        )
        return token_penalties

    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 创建分数惩罚
        score_penalties = self._create_score_penalties(input_ids[:, :cur_len], scores)

        # 将分数乘以相应的惩罚值
        scores = tf.math.multiply(scores, score_penalties)

        return scores


class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
    """
    [`TFLogitsProcessor`] that enforces that specified sequences will never be sampled.
    """
    Args:
        bad_words_ids (`List[List[int]]`):
            不允许生成的令牌 ID 列表的列表。为了获取不应出现在生成文本中的词汇的令牌，请确保在初始化分词器时设置 `add_prefix_space=True`，并使用 `tokenizer(bad_words, add_special_tokens=False).input_ids` 来获取这些词汇的令牌 ID 列表。对于某些较慢的分词器，`add_prefix_space` 参数是支持的，因为快速分词器的前缀行为来自于 `pre tokenizers`。详细信息请参阅 [这里](https://huggingface.co/docs/tokenizers/api/pre-tokenizers)。
        eos_token_id (`int`):
            *end-of-sequence*（EOS）令牌的 ID。
    """

    def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int):
        # 检查 `bad_words_ids` 是否为列表且非空
        if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0:
            raise ValueError(f"`bad_words_ids` 必须是非空列表，当前为 {bad_words_ids}。")
        # 检查 `bad_words_ids` 中的每个元素是否为列表
        if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids):
            raise ValueError(f"`bad_words_ids` 必须是列表的列表，当前为 {bad_words_ids}。")
        # 检查 `bad_words_ids` 中的每个元素是否为正整数列表
        if any(
            any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids)
            for bad_word_ids in bad_words_ids
        ):
            raise ValueError(
                f"`bad_words_ids` 中的每个列表必须是正整数列表，当前为 {bad_words_ids}。"
            )

        # 存储关于不允许的词汇的信息，使用三个张量：
        # 1. 一个矩形张量，包含禁止序列（用 `-1` 填充），用于完整数据比较
        self.bad_word_seqs_ids = tf.ragged.constant(bad_words_ids).to_tensor(default_value=-1)
        # 2. 一个张量，包含每个禁止序列的未填充长度，用于快速长度比较
        bad_word_seqs_len = [len(bad_words) for bad_words in bad_words_ids]
        # 检查禁止词汇序列的长度是否为零
        if any(word_len == 0 for word_len in bad_word_seqs_len):
            raise ValueError(f"禁止词汇序列 {bad_words_ids} 不能包含空列表")
        self.bad_word_seqs_len = tf.convert_to_tensor(bad_word_seqs_len, dtype=tf.int32)
        # 3. 一个张量，包含每个序列的最后一个令牌，便于访问可能被禁止的令牌
        self.seq_forbidden_tokens = tf.convert_to_tensor([bad_words[-1] for bad_words in bad_words_ids])
    def _calc_row_banned_bad_tokens(self, row_input_ids: tf.Tensor) -> tf.Tensor:
        def _tokens_match(bad_word_seq_number):
            def _len_one():
                # 如果坏序列只有一个标记，则始终屏蔽它
                return tf.cond(
                    tf.math.equal(self.bad_word_seqs_len[bad_word_seq_number], 1),
                    lambda: tf.ones((), dtype=tf.bool),
                    _len_greater_than_cur_len,
                )

            def _len_greater_than_cur_len():
                # 否则，如果坏序列比当前长度长，它们永远不会匹配
                return tf.cond(
                    tf.math.greater(self.bad_word_seqs_len[bad_word_seq_number], tf.shape(row_input_ids)[0]),
                    lambda: tf.zeros((), dtype=tf.bool),
                    _match_found,
                )

            def _match_found():
                # 最后，执行实际的比较。只有在之前的比较没有结果时才能调用（否则会导致索引异常）
                compare_len = self.bad_word_seqs_len[bad_word_seq_number] - 1
                return tf.cond(
                    tf.math.reduce_all(
                        tf.math.equal(
                            row_input_ids[-compare_len:], self.bad_word_seqs_ids[bad_word_seq_number, :compare_len]
                        )
                    ),
                    lambda: tf.ones((), dtype=tf.bool),
                    lambda: tf.zeros((), dtype=tf.bool),
                )

            match = _len_one()
            return match

        # 将当前行与所有坏词序列进行比较，获取匹配的掩码
        match_mask = tf.map_fn(_tokens_match, tf.range(self.bad_word_seqs_ids.shape[0]), fn_output_signature=tf.bool)
        row_banned_tokens = self.seq_forbidden_tokens[match_mask]
        return row_banned_tokens
        # 定义一个调用函数，接受输入的 `input_ids`（Tensor 类型）、分数 `scores`（Tensor 类型）、当前长度 `cur_len`（整数类型），返回更新后的分数 `scores`（Tensor 类型）
        # 我们希望在分数级别上屏蔽一些被禁止的令牌。由于被禁止的令牌取决于前一个 `input_ids`，它们可能对每一行具有不同的长度，甚至对某些行来说可能为空。
        # 为了保持简单并与 XLA 兼容，我们以逐行的方式进行操作。
        # TODO（Joao）：这个函数可能会因为 `cur_len` 的增加而触发 XLA 重追踪。如果这成为频繁的瓶颈，请修复它。（将 `cur_len` 设为一个张量？）
        def _get_row_updated_score(row_inputs: Tuple[tf.Tensor]) -> tf.Tensor:
            # 获取当前行的输入 `row_input_ids` 和分数 `row_score`
            row_input_ids, row_score = row_inputs
            # 计算当前行被禁止的坏令牌列表，基于 `row_input_ids` 的前 `cur_len` 部分
            banned_tokens = self._calc_row_banned_bad_tokens(row_input_ids[:cur_len])
            # 创建一个布尔类型的张量，表示被禁止的令牌的位置，其形状与 `row_score` 相同
            banned_tokens_mask = tf.scatter_nd(
                indices=tf.expand_dims(banned_tokens, axis=-1),
                updates=tf.ones_like(banned_tokens, dtype=tf.bool),
                shape=row_score.shape,
            )
            # 使用 `-inf` 替换被禁止令牌的位置上的分数，保持其它位置不变
            row_score = tf.where(banned_tokens_mask, -float("inf"), row_score)
            return row_score
        
        # 对每一行调用 `_get_row_updated_score` 函数，更新分数 `scores`，并返回更新后的 `scores`
        scores = tf.map_fn(_get_row_updated_score, (input_ids, scores), fn_output_signature=tf.float32)
        return scores
class TFNoRepeatNGramLogitsProcessor(TFLogitsProcessor):
    r"""
    [`TFLogitsProcessor`] that enforces no repetition of n-grams. See
    [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).

    Args:
        ngram_size (`int`):
            All ngrams of size `ngram_size` can only occur once.
    """

    def __init__(self, ngram_size: int):
        # 初始化方法，验证并设置 ngram_size 参数
        if not isinstance(ngram_size, int) or ngram_size <= 0:
            raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
        self.ngram_size = ngram_size

    def calc_banned_ngram_tokens(self, input_ids, num_hypos, cur_len):
        # 计算禁止的 ngram tokens，用于防止 ngram 重复
        # 从 fairseq 中复制用于在 beam search 中实现 no_repeat_ngram
        if cur_len + 1 < self.ngram_size:
            # 如果当前长度加 1 小于 ngram_size，返回空列表表示没有禁止的 token
            return [[] for _ in range(num_hypos)]
        generated_ngrams = [{} for _ in range(num_hypos)]
        prev_input_ids = input_ids[:, :cur_len]
        for idx in range(num_hypos):
            gen_tokens = prev_input_ids[idx].numpy().tolist()
            generated_ngram = generated_ngrams[idx]
            for ngram in zip(*[gen_tokens[i:] for i in range(self.ngram_size)]):
                prev_ngram_tuple = tuple(ngram[:-1])
                generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]

        def _get_generated_ngrams(hypo_idx):
            # 在解码下一个 token 前，防止解码已经出现的 ngrams
            start_idx = cur_len + 1 - self.ngram_size
            ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist())
            return generated_ngrams[hypo_idx].get(ngram_idx, [])

        # 返回禁止的 tokens 列表
        banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]

        return banned_tokens

    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 调用对象时的处理方法，用于处理 logits
        # TODO (joao): enable XLA on this logits processor. See discussion and attempts in
        # https://github.com/huggingface/transformers/pull/16974
        if not tf.executing_eagerly():
            raise NotImplementedError("TFNoRepeatNGramLogitsProcessor is only implemented for eager execution.")

        batch_size, vocab_size = scores.shape
        # 计算禁止的 ngram tokens
        banned_tokens = self.calc_banned_ngram_tokens(input_ids, batch_size, cur_len)

        # 创建禁止 tokens 的布尔掩码
        banned_tokens_indices_mask = []
        for banned_tokens_slice in banned_tokens:
            banned_tokens_indices_mask.append(
                [True if token in banned_tokens_slice else False for token in range(vocab_size)]
            )

        # 将禁止的 tokens 对应位置的 logits 设置为负无穷
        scores = tf.where(tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf"), scores)

        return scores


class TFForcedBOSTokenLogitsProcessor(TFLogitsProcessor):
    r"""
    # 初始化函数，接受强制作为第一个生成标记的标记 ID
    def __init__(self, bos_token_id: int):
        # 如果 bos_token_id 小于 0，则引发值错误异常
        if bos_token_id < 0:
            raise ValueError(f"The forced bos token id must be a non-negative integer, got {bos_token_id}")
        # 将传入的 bos_token_id 分配给实例变量
        self.bos_token_id = bos_token_id

    # 调用函数，处理输入的 token IDs 和对应的分数，根据当前生成的长度 cur_len 进行调整
    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 如果当前生成的长度为 1
        if cur_len == 1:
            # 获取批处理大小和标记数
            batch_size, num_tokens = scores.shape
            # 将 bos_token_id 列的分数设为 0
            scores = tf.zeros((batch_size, 1))
            # 如果 bos_token_id 大于 0，将除了第 bos_token_id 列外的分数设置为负无穷
            if self.bos_token_id > 0:
                scores = tf.concat((tf.broadcast_to(-float("inf"), (batch_size, self.bos_token_id)), scores), axis=-1)
            # 如果 bos_token_id 小于 (num_tokens - 1)，将除了第 bos_token_id 列外的分数设置为负无穷
            if self.bos_token_id < (num_tokens - 1):
                scores = tf.concat(
                    (scores, tf.broadcast_to(-float("inf"), (batch_size, (num_tokens - 1) - self.bos_token_id))),
                    axis=-1,
                )
        # 返回调整后的分数张量
        return scores
# 定义一个继承自 `TFLogitsProcessor` 的类，用于在达到 `max_length` 时强制指定的 token 成为生成序列的最后一个 token。
class TFForcedEOSTokenLogitsProcessor(TFLogitsProcessor):
    r"""
    [`TFLogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is reached.

    Args:
        max_length (`int`):
            The maximum length of the sequence to be generated.
        eos_token_id (`int`):
            The id of the token to force as the last generated token when `max_length` is reached.
    """

    # 初始化方法，设置 `max_length` 和 `eos_token_id`
    def __init__(self, max_length: int, eos_token_id: int):
        self.max_length = max_length
        # 如果 `eos_token_id` 小于 0，则抛出错误
        if eos_token_id < 0:
            raise ValueError(f"The forced eos token id must be a non-negative integer, got {eos_token_id}")
        self.eos_token_id = eos_token_id

    # 调用方法，根据当前生成的长度 `cur_len` 对 `scores` 进行处理
    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 当当前长度 `cur_len` 等于 `max_length - 1` 时
        if cur_len == self.max_length - 1:
            batch_size, num_tokens = scores.shape
            # 将 `scores` 在 `eos_token_id` 列上的值设为 0
            scores = tf.zeros((batch_size, 1))
            # 在除了 `eos_token_id` 外的其他位置上的值设为负无穷
            if self.eos_token_id > 0:
                scores = tf.concat((tf.broadcast_to(-float("inf"), (batch_size, self.eos_token_id)), scores), axis=-1)
            if self.eos_token_id < (num_tokens - 1):
                scores = tf.concat(
                    (scores, tf.broadcast_to(-float("inf"), (batch_size, (num_tokens - 1) - self.eos_token_id))),
                    axis=-1,
                )
        return scores


# 定义一个继承自 `TFLogitsProcessor` 的类，用于在生成序列开始时抑制一组 token 的生成。
class TFSuppressTokensAtBeginLogitsProcessor(TFLogitsProcessor):
    r"""
    [`TFSuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts
    generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` at not
    sampled at the begining of the generation.
    """

    # 初始化方法，设置 `begin_suppress_tokens` 和 `begin_index`
    def __init__(self, begin_suppress_tokens, begin_index):
        self.begin_suppress_tokens = list(begin_suppress_tokens)
        self.begin_index = begin_index

    # 调用方法，根据当前生成的长度 `cur_len` 对 `scores` 进行处理
    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 当当前长度 `cur_len` 等于 `begin_index` 时
        scores = tf.cond(
            tf.equal(cur_len, self.begin_index),
            # 使用 `tf.tensor_scatter_nd_update` 将 `scores` 中指定位置的值更新为负无穷
            lambda: tf.tensor_scatter_nd_update(
                scores,
                indices=[[i, token] for i in range(scores.shape[0]) for token in self.begin_suppress_tokens],
                updates=[-float("inf") for _ in range(scores.shape[0] * len(self.begin_suppress_tokens))],
            ),
            lambda: scores,  # 如果条件不满足，返回原始的 `scores`
        )
        return scores


# 定义一个继承自 `TFLogitsProcessor` 的类，用于抑制一组 token 的生成。
class TFSuppressTokensLogitsProcessor(TFLogitsProcessor):
    r"""This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so that they
    are not sampled."""

    # 初始化方法，设置 `suppress_tokens`
    def __init__(self, suppress_tokens):
        self.suppress_tokens = list(suppress_tokens)
    # 定义一个方法 __call__，该方法接受三个参数：input_ids 是 tf.Tensor 类型，scores 是 tf.Tensor 类型，cur_len 是整数类型
    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 使用 tf.tensor_scatter_nd_update 函数更新 scores 张量
        scores = tf.tensor_scatter_nd_update(
            # 更新的目标张量是 scores
            scores,
            # 更新操作的索引是一个列表推导式，生成所有 (i, token) 对的索引
            indices=[[i, token] for i in range(scores.shape[0]) for token in self.suppress_tokens],
            # 更新操作的值是一个列表推导式，生成所有需要更新的 -inf 值
            updates=[-float("inf") for _ in range(scores.shape[0] * len(self.suppress_tokens))],
        )
        # 返回更新后的 scores 张量
        return scores
class TFForceTokensLogitsProcessor(TFLogitsProcessor):
    r"""This processor takes a list of pairs of integers which indicates a mapping from generation indices to token
    indices that will be forced before sampling. The processor will set their log probs to `0` and all other tokens to
    `-inf` so that they are sampled at their corresponding index."""

    def __init__(self, force_token_map: List[List[int]]):
        # 将输入的强制 token 映射列表转换为字典形式，格式为 {index: token}
        force_token_map = dict(force_token_map)
        
        # 创建一个数组 force_token_array，其长度为 force_token_map 中最大的索引加一，
        # 初始化所有元素为 -1，用于表示未被强制的 token
        force_token_array = np.ones((max(force_token_map.keys()) + 1), dtype=np.int32) * -1
        
        # 遍历 force_token_map，将指定索引位置的 token 值存入 force_token_array
        for index, token in force_token_map.items():
            if token is not None:
                force_token_array[index] = token
        
        # 将 force_token_array 转换为 TensorFlow 张量，并存储在实例变量 self.force_token_array 中
        self.force_token_array = tf.convert_to_tensor(force_token_array, dtype=tf.int32)

    def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor:
        # 定义内部函数 _force_token，用于处理强制 token 的逻辑
        def _force_token(generation_idx):
            batch_size = scores.shape[0]
            current_token = self.force_token_array[generation_idx]

            # 创建一个新的得分张量 new_scores，初始化为 -inf
            new_scores = tf.ones_like(scores, dtype=scores.dtype) * -float("inf")
            
            # 创建索引张量 indices，用于更新 new_scores 中的特定位置为 0
            indices = tf.stack((tf.range(batch_size), tf.tile([current_token], [batch_size])), axis=1)
            updates = tf.zeros((batch_size,), dtype=scores.dtype)
            new_scores = tf.tensor_scatter_nd_update(new_scores, indices, updates)
            
            return new_scores
        
        # 根据当前序列长度 cur_len 和 force_token_array 的长度，决定是否对 scores 进行处理
        scores = tf.cond(
            tf.greater_equal(cur_len, tf.shape(self.force_token_array)[0]),
            # 如果当前长度大于等于 force_token_array 的长度，不进行处理，直接返回 scores
            lambda: tf.identity(scores),
            # 否则，根据 force_token_array 中对应位置的值决定是否强制 token
            lambda: tf.cond(
                tf.greater_equal(self.force_token_array[cur_len], 0),
                # 如果 force_token_array[cur_len] 大于等于 0，调用 _force_token 强制 token
                lambda: _force_token(cur_len),
                # 否则，不进行处理，直接返回 scores
                lambda: scores,
            ),
        )
        
        return scores

`.\generation\tf_utils.py`

# 导入所需的模块和库
import copy  # 导入 copy 模块，用于复制对象
import inspect  # 导入 inspect 模块，用于获取对象信息
import warnings  # 导入 warnings 模块，用于处理警告
from dataclasses import dataclass  # 从 dataclasses 模块导入 dataclass 装饰器
from typing import Any, Dict, Optional, Tuple, Union  # 导入类型提示模块

import numpy as np  # 导入 NumPy 库，并使用别名 np
import tensorflow as tf  # 导入 TensorFlow 库，并使用别名 tf
from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice  # 导入特定函数

# 从相对路径中导入模型输出类
from ..modeling_tf_outputs import TFCausalLMOutputWithPast, TFSeq2SeqLMOutput
# 从相对路径中导入自动模型映射字典
from ..models.auto import (
    TF_MODEL_FOR_CAUSAL_LM_MAPPING,
    TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
    TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
    TF_MODEL_FOR_VISION_2_SEQ_MAPPING,
)
# 从相对路径中导入 TensorFlow 工具函数和稳定 softmax 函数
from ..tf_utils import shape_list, stable_softmax
# 从相对路径中导入模型输出类和日志记录函数
from ..utils import ModelOutput, logging
# 从相对路径中导入生成配置类
from .configuration_utils import GenerationConfig
# 从相对路径中导入 TensorFlow 日志处理相关模块
from .tf_logits_process import (
    TFForcedBOSTokenLogitsProcessor,
    TFForcedEOSTokenLogitsProcessor,
    TFForceTokensLogitsProcessor,
    TFLogitsProcessorList,
    TFMinLengthLogitsProcessor,
    TFNoBadWordsLogitsProcessor,
    TFNoRepeatNGramLogitsProcessor,
    TFRepetitionPenaltyLogitsProcessor,
    TFSuppressTokensAtBeginLogitsProcessor,
    TFSuppressTokensLogitsProcessor,
    TFTemperatureLogitsWarper,
    TFTopKLogitsWarper,
    TFTopPLogitsWarper,
)

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义 TFGreedySearchDecoderOnlyOutput 类，继承自 ModelOutput 基类
@dataclass
class TFGreedySearchDecoderOnlyOutput(ModelOutput):
    """
    Base class for outputs of decoder-only generation models using greedy search.
    """
    pass  # 类定义结束
    # 参数列表：
    # sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
    #     生成的序列。第二维 (sequence_length) 要么等于 `max_length`，要么在所有批次由于 `eos_token_id` 而提前结束时要短。
    # scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
    #     语言建模头部的处理预测分数（SoftMax 之前的每个词汇标记的分数）在每个生成步骤。元组的 `tf.Tensor`，最多包含 `max_new_tokens` 个元素（每个生成的标记一个元素），每个张量的形状为 `(batch_size, config.vocab_size)`。
    # attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
    #     元组（每个生成的标记一个元素）的元组（解码器每一层一个元素）的 `tf.Tensor`，形状为 `(batch_size, num_heads, generated_length, sequence_length)`。
    # hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    #     元组（每个生成的标记一个元素）的元组（解码器每一层一个元素）的 `tf.Tensor`，形状为 `(batch_size, generated_length, hidden_size)`。

    sequences: tf.Tensor = None  # 初始化 sequences 变量为 None
    scores: Optional[Tuple[tf.Tensor]] = None  # 初始化 scores 变量为 None，类型为可选的元组
    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None  # 初始化 attentions 变量为 None，类型为可选的元组的元组
    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None  # 初始化 hidden_states 变量为 None，类型为可选的元组的元组
@dataclass
class TFGreedySearchEncoderDecoderOutput(ModelOutput):
    """
    Base class for outputs of encoder-decoder generation models using greedy search. Hidden states and attention
    weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the
    encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
    """

    sequences: tf.Tensor = None
    # 生成的序列，形状为(batch_size, sequence_length)，第二个维度(sequence_length)要么等于max_length，要么因为eos_token_id提前结束而较短
    scores: Optional[Tuple[tf.Tensor]] = None
    # 可选项，当传入output_scores=True或config.output_scores=True时返回，是语言建模头部的处理过的预测分数（SoftMax之前每个词汇标记的分数），每个生成步骤可能有多达max_new_tokens个元素，每个张量形状为(batch_size, config.vocab_size)
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
    # 可选项，当传入output_attentions=True或config.output_attentions=True时返回，元组的每个元素对应解码器每层的注意力张量，形状为(batch_size, num_heads, sequence_length, sequence_length)
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    # 可选项，当传入output_hidden_states=True或config.output_hidden_states=True时返回，元组的每个元素对应嵌入层和每层输出的隐藏状态张量，形状为(batch_size, sequence_length, hidden_size)
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    # 可选项，当传入output_attentions=True或config.output_attentions=True时返回，元组的每个元素对应每个生成的标记，每层解码器的注意力张量元组，形状为(batch_size, num_heads, generated_length, sequence_length)
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    # 可选项，当传入output_attentions=True或config.output_attentions=True时返回，元组的每个元素对应每个生成的标记，每层解码器的交叉注意力张量元组，形状为(batch_size, num_heads, generated_length, sequence_length)
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
    # 可选项，当传入output_hidden_states=True或config.output_hidden_states=True时返回，元组的每个元素对应每个生成的标记，每层解码器的隐藏状态张量元组，形状为(batch_size, generated_length, hidden_size)
    # 定义一个可选的变量，用于存储编码器的隐藏状态（Tensor 的元组）。初始值为 None。
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    
    # 定义一个可选的变量，用于存储解码器注意力权重（Tensor 元组的元组）。初始值为 None。
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    
    # 定义一个可选的变量，用于存储交叉注意力权重（Tensor 元组的元组）。初始值为 None。
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    
    # 定义一个可选的变量，用于存储解码器的隐藏状态（Tensor 元组的元组）。初始值为 None。
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
@dataclass
class TFSampleDecoderOnlyOutput(ModelOutput):
    """
    Decoder-only生成模型使用采样生成的输出的基类。

    Args:
        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            生成的序列。第二个维度（sequence_length）要么等于`max_length`，要么比`eos_token_id`提前结束。
        scores (`tuple(tf.Tensor)` *optional*, 当传入`output_scores=True`或者`config.output_scores=True`时返回):
            语言建模头部的处理过的预测分数（SoftMax之前的每个词汇标记的分数）在每个生成步骤中。
            元组中包含最多`max_new_tokens`个元素（每个生成的词汇标记一个元素），每个张量的形状为`(batch_size*num_return_sequences, config.vocab_size)`。
        attentions (`tuple(tuple(tf.Tensor))`, *optional*, 当传入`output_attentions=True`或者`config.output_attentions=True`时返回):
            每个生成的词汇标记的元组（每个生成的词汇标记一个元素），其中包含解码器每一层的注意力张量。
            注意力张量的形状为`(num_return_sequences*batch_size, num_heads, generated_length, sequence_length)`。
        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, 当传入`output_hidden_states=True`或者`config.output_hidden_states=True`时返回):
            每个生成的词汇标记的元组（每个生成的词汇标记一个元素），其中包含解码器每一层的隐藏状态张量。
            隐藏状态张量的形状为`(num_return_sequences*batch_size, generated_length, hidden_size)`。
    """

    sequences: tf.Tensor = None
    scores: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFSampleEncoderDecoderOutput(ModelOutput):
    """
    Encoder-decoder生成模型使用采样生成的输出的基类。可以通过encoder_attentions和encoder_hidden_states属性（分别通过decoder_attentions和decoder_hidden_states属性）访问解码器（分别是编码器）的隐藏状态和注意力权重。

    """
    # 定义函数的参数和它们的类型注解，这些参数用于接收生成的序列、预测分数、编码器注意力、编码器隐藏状态、
    # 解码器注意力、交叉注意力以及解码器隐藏状态。这些参数都是可选的，根据函数调用时传递的参数决定是否使用。
    Args:
        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            生成的序列。第二维（sequence_length）要么等于 `max_length`，要么因为 `eos_token_id` 导致所有批次提前结束而更短。
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            语言建模头部处理后的预测分数（在SoftMax之前的每个词汇标记的分数），每一代步骤有一个元组，包含最多 `max_new_tokens` 个元素，
            每个张量的形状为 `(batch_size*num_return_sequences, config.vocab_size)`。
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            编码器注意力的元组（每个解码器层一个张量），形状为 `(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`。
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            编码器隐藏状态的元组（每个解码器层一个张量），形状为 `(batch_size*num_return_sequences, sequence_length, hidden_size)`。
        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            解码器注意力的元组（每个生成的令牌一个元组，每个解码器层一个张量），形状为 `(batch_size*num_return_sequences, num_heads, generated_length, sequence_length)`。
        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            交叉注意力的元组（每个生成的令牌一个元组，每个解码器层一个张量），形状为 `(batch_size, num_heads, generated_length, sequence_length)`。
        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            解码器隐藏状态的元组（每个生成的令牌一个元组，每个解码器层一个张量），形状为 `(batch_size*num_return_sequences, generated_length, hidden_size)`。
    
    # 初始化所有参数为 None，表示这些参数在函数调用时可以不传递，或者传递为 None，函数会根据需要进行处理。
    sequences: tf.Tensor = None
    scores: Optional[Tuple[tf.Tensor]] = None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
# 使用 dataclass 装饰器定义 TFBeamSearchDecoderOnlyOutput 类，表示仅解码器使用 beam search 生成模型的输出。
@dataclass
class TFBeamSearchDecoderOnlyOutput(ModelOutput):
    """
    Base class for outputs of decoder-only generation models using beam search.
    解码器仅使用 beam search 生成模型的输出的基类。

    Args:
        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
            生成的序列。第二维度（sequence_length）要么等于 `max_length`，要么由于 `eos_token_id` 导致所有批次提前结束而更短。
        sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Final beam scores of the generated `sequences`.
            生成的 `sequences` 的最终 beam 分数。
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
            softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this
            beam. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
            每一代生成步骤中每个词汇标记的处理过的 beam 分数。包括每个词汇标记的 log softmax 分数和该 beam 中先前生成的标记的 log softmax 的总和。
        beam_indices (`tf.Tensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam indices of generated token id at each generation step. `tf.Tensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
            每个生成步骤生成的标记 id 的 beam 索引。形状为 `(batch_size*num_return_sequences, sequence_length)` 的 `tf.Tensor`。
        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
            每个生成的标记的元组（每个解码器层的一个元素）的元组（每个生成的标记的元素）的注意力张量。形状为 `(batch_size*num_beams, num_heads, generated_length, sequence_length)` 的 `tf.Tensor`。
        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
            每个生成的标记的元组（每个解码器层的一个元素）的元组（每个生成的标记的元素）的隐藏状态张量。形状为 `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)` 的 `tf.Tensor`。
    """

    sequences: tf.Tensor = None  # 生成的序列
    sequences_scores: Optional[tf.Tensor] = None  # 生成序列的最终 beam 分数，可选
    scores: Optional[Tuple[tf.Tensor]] = None  # 每个生成步骤中每个词汇标记的处理过的 beam 分数，可选
    beam_indices: Optional[tf.Tensor] = None  # 每个生成步骤生成的标记 id 的 beam 索引，可选
    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None  # 每个生成的标记的注意力张量，可选
    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None  # 每个生成的标记的隐藏状态张量，可选


# 使用 dataclass 装饰器定义 TFBeamSearchEncoderDecoderOutput 类，表示编码器-解码器使用 beam search 生成模型的输出。
@dataclass
class TFBeamSearchEncoderDecoderOutput(ModelOutput):
    """
    Base class for outputs of encoder-decoder generation models using beam search. Hidden states and attention weights
    of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states
    编码器-解码器使用 beam search 生成模型的输出的基类。可以通过 encoder_attentions 和 encoder_hidden_states 访问解码器（或编码器）的隐藏状态和注意力权重。

    """
    # 定义一个包含多个属性的数据类，用于存储序列、分数、索引以及各种注意力和隐藏状态信息
    
    sequences: tf.Tensor = None
    # 序列数据，类型为 TensorFlow 的张量，初始值为 None
    sequences_scores: Optional[tf.Tensor] = None
    # 序列的分数数据，类型为可选的 TensorFlow 张量，初始值为 None
    scores: Optional[Tuple[tf.Tensor]] = None
    # 分数数据，类型为可选的 TensorFlow 张量元组，初始值为 None
    beam_indices: Optional[tf.Tensor] = None
    # Beam 搜索的索引数据，类型为可选的 TensorFlow 张量，初始值为 None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
    # 编码器注意力数据，类型为可选的 TensorFlow 张量元组，初始值为 None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    # 编码器隐藏状态数据，类型为可选的 TensorFlow 张量元组，初始值为 None
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    # 解码器注意力数据，类型为可选的嵌套元组的 TensorFlow 张量元组，初始值为 None
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    # 交叉注意力数据，类型为可选的嵌套元组的 TensorFlow 张量元组，初始值为 None
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
    # 解码器隐藏状态数据，类型为可选的嵌套元组的 TensorFlow 张量元组，初始值为 None
@dataclass
class TFBeamSampleDecoderOnlyOutput(ModelOutput):
    """
    Decoder-only生成模型使用Beam采样的输出的基类。

    Args:
        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            生成的序列。第二维(sequence_length)要么等于`max_length`，要么因为`eos_token_id`导致所有批次提前结束而更短。
        sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            生成的`sequences`的最终beam分数。
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            每个生成步骤中每个词汇标记的处理beam分数。每个元素为`tf.Tensor`的元组，最多有`max_new_tokens`个元素（每个生成的标记一个元素），每个张量的形状为`(batch_size*num_beams*num_return_sequences, config.vocab_size)`。
        beam_indices (`tf.Tensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            每个生成步骤生成的标记ID的beam索引。形状为`(batch_size*num_return_sequences, sequence_length)`的`tf.Tensor`。
        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            每个生成的标记的注意力权重。元组（每个生成标记一个元素），元组（每个解码器层一个元素），`tf.Tensor`的元组，形状为`(batch_size*num_beams, num_heads, generated_length, sequence_length)`。
        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            解码器每层的隐藏状态。元组（每个生成标记一个元素），元组（每个解码器层一个元素），`tf.Tensor`的元组，形状为`(batch_size*num_beams, generated_length, hidden_size)`。
    """

    sequences: tf.Tensor = None
    sequences_scores: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    beam_indices: Optional[tf.Tensor] = None
    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFBeamSampleEncoderDecoderOutput(ModelOutput):
    """
    Encoder-decoder生成模型使用Beam采样的输出的基类。可以通过encoder_attentions和encoder_hidden_states属性访问解码器（或者通过decoder_attentions和decoder_hidden_states属性访问编码器）的隐藏状态和注意力权重。

    """
    # 定义一个变量 sequences，类型为 tf.Tensor，初始值为 None
    sequences: tf.Tensor = None
    
    # 定义一个变量 sequences_scores，类型为 Optional[tf.Tensor]，初始值为 None
    sequences_scores: Optional[tf.Tensor] = None
    
    # 定义一个变量 scores，类型为 Optional[Tuple[tf.Tensor]]，初始值为 None
    scores: Optional[Tuple[tf.Tensor]] = None
    
    # 定义一个变量 beam_indices，类型为 Optional[tf.Tensor]，初始值为 None
    beam_indices: Optional[tf.Tensor] = None
    
    # 定义一个变量 encoder_attentions，类型为 Optional[Tuple[tf.Tensor]]，初始值为 None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
    
    # 定义一个变量 encoder_hidden_states，类型为 Optional[Tuple[tf.Tensor]]，初始值为 None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    
    # 定义一个变量 decoder_attentions，类型为 Optional[Tuple[Tuple[tf.Tensor]]]，初始值为 None
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    
    # 定义一个变量 cross_attentions，类型为 Optional[Tuple[Tuple[tf.Tensor]]]，初始值为 None
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    
    # 定义一个变量 decoder_hidden_states，类型为 Optional[Tuple[Tuple[tf.Tensor]]]，初始值为 None
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
@dataclass
class TFContrastiveSearchDecoderOnlyOutput(ModelOutput):
    """
    Decoder-only generation model output class for contrastive search.

    Args:
        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
            generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
    """

    sequences: tf.Tensor = None
    scores: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFContrastiveSearchEncoderDecoderOutput(ModelOutput):
    """
    Encoder-decoder generation model output class for contrastive search.

    Base class for outputs of encoder-decoder generation models using contrastive search. Hidden states and attention
    weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the
    encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
    """
    """
    Args:
        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            生成的序列。第二个维度 (sequence_length) 可能等于 `max_length`，或者如果所有批次由于 `eos_token_id` 而提前结束，则会更短。
        scores (`tuple(tf.Tensor)` *optional*, 当 `output_scores=True` 传递或 `config.output_scores=True` 时返回):
            语言建模头部处理后的预测分数（SoftMax 前每个词汇标记的分数），每个生成步骤一个元组元素，元素数最多为 `max_new_tokens`，每个张量的形状为 `(batch_size, config.vocab_size)`。
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, 当 `output_attentions=True` 传递或 `config.output_attentions=True` 时返回):
            解码器每一层的注意力权重张量的元组，形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, 当 `output_hidden_states=True` 传递或 `config.output_hidden_states=True` 时返回):
            解码器每一层的隐藏状态张量的元组，形状为 `(batch_size, sequence_length, hidden_size)`，包含从嵌入层开始的所有层的输出。
        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, 当 `output_attentions=True` 传递或 `config.output_attentions=True` 时返回):
            每个生成的标记一个元组元素，其中每个元素是解码器每一层的注意力权重张量的元组，形状为 `(batch_size, num_heads, generated_length, sequence_length)`。
        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, 当 `output_attentions=True` 传递或 `config.output_attentions=True` 时返回):
            每个生成的标记一个元组元素，其中每个元素是解码器每一层与编码器的交叉注意力权重张量的元组，形状为 `(batch_size, num_heads, generated_length, sequence_length)`。
        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, 当 `output_hidden_states=True` 传递或 `config.output_hidden_states=True` 时返回):
            每个生成的标记一个元组元素，其中每个元素是解码器每一层的隐藏状态张量的元组，形状为 `(batch_size, generated_length, hidden_size)`。
    """

    sequences: tf.Tensor = None
    scores: Optional[Tuple[tf.Tensor]] = None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None
# 定义类型别名，表示不同的生成器输出类型
TFGreedySearchOutput = Union[TFGreedySearchEncoderDecoderOutput, TFGreedySearchDecoderOnlyOutput]
TFSampleOutput = Union[TFSampleEncoderDecoderOutput, TFSampleDecoderOnlyOutput]
TFBeamSearchOutput = Union[TFBeamSearchEncoderDecoderOutput, TFBeamSearchDecoderOnlyOutput]
TFBeamSampleOutput = Union[TFBeamSampleEncoderDecoderOutput, TFBeamSampleDecoderOnlyOutput]
TFContrastiveSearchOutput = Union[TFContrastiveSearchEncoderDecoderOutput, TFContrastiveSearchDecoderOnlyOutput]
# 定义一个类型别名，表示所有生成器可能的输出类型
TFGenerateOutput = Union[
    TFGreedySearchOutput, TFSampleOutput, TFBeamSearchOutput, TFBeamSampleOutput, TFContrastiveSearchOutput
]

class TFGenerationMixin:
    """
    包含支持生成的所有函数的类，用作[`TFPreTrainedModel`]中的混合类。

    该类公开[`~generation.TFGenerationMixin.generate`]，可以用于：
        - 当`num_beams=1`且`do_sample=False`时通过调用[`~generation.TFGenerationMixin.greedy_search`]进行*贪婪解码*
        - 当`penalty_alpha>0`且`top_k>1`时通过调用[`~generation.TFGenerationMixin.contrastive_search`]进行*对比搜索*
        - 当`num_beams=1`且`do_sample=True`时通过调用[`~generation.TFGenerationMixin.sample`]进行*多项式采样*
        - 当`num_beams>1`时通过调用[`~generation.TFGenerationMixin.beam_search`]进行*束搜索解码*

    不需要直接调用上述任何方法。而是将自定义参数值传递给 'generate' 方法。有关解码策略的更多信息，请参阅[text generation strategies guide](../generation_strategies)。
    """

    _seed_generator = None

    @property
    def seed_generator(self):
        # 警告：`seed_generator`已弃用，并将在未来版本中移除。
        warnings.warn("`seed_generator` is deprecated and will be removed in a future version.", UserWarning)
        if self._seed_generator is None:
            # 如果尚未初始化种子生成器，则从不确定状态创建一个随机生成器
            self._seed_generator = tf.random.Generator.from_non_deterministic_state()
        return self._seed_generator

    # 表示该类支持 XLA 生成
    supports_xla_generation = True

    def prepare_inputs_for_generation(self, *args, **kwargs):
        # 如果模型类想要使用 `generate` 方法，需要定义 `prepare_inputs_for_generation` 方法
        raise NotImplementedError(
            "A model class needs to define a `prepare_inputs_for_generation` method in order to use `generate`."
        )

    def compute_transition_scores(
        self,
        sequences: tf.Tensor,
        scores: Tuple[tf.Tensor],
        beam_indices: Optional[tf.Tensor] = None,
        normalize_logits: bool = False,
    def _validate_model_class(self):
        """
        Confirms that the model class is compatible with generation. If not, raises an exception that points to the
        right class to use.
        """
        # 检查当前模型类是否可以生成文本
        if not self.can_generate():
            # 定义兼容生成操作的模型映射列表
            generate_compatible_mappings = [
                TF_MODEL_FOR_CAUSAL_LM_MAPPING,
                TF_MODEL_FOR_VISION_2_SEQ_MAPPING,
                TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
                TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
            ]
            generate_compatible_classes = set()
            # 遍历每个模型映射，检查当前模型类是否在其支持的模型中
            for model_mapping in generate_compatible_mappings:
                supported_models = model_mapping.get(type(self.config), default=None)
                if supported_models is not None:
                    generate_compatible_classes.add(supported_models.__name__)
            # 构建异常消息，指示当前模型类不支持生成操作，并推荐可用的兼容模型类
            exception_message = (
                f"The current model class ({self.__class__.__name__}) is not compatible with `.generate()`, as "
                "it doesn't have a language model head."
            )
            if generate_compatible_classes:
                exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
            # 抛出类型错误异常，包含详细的错误消息
            raise TypeError(exception_message)

    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
        """Validates model kwargs for generation. Generate argument typos will also be caught here."""
        # 如果是编码-解码模型，排除在调用任何模型函数之前已处理的参数
        if self.config.is_encoder_decoder:
            for key in ["decoder_input_ids"]:
                model_kwargs.pop(key, None)

        unused_model_args = []
        model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
        # 检查是否 prepare_inputs_for_generation 方法接受了 `kwargs` 或 `model_kwargs` 参数，以便处理可选的前向传递输入
        if "kwargs" in model_args or "model_kwargs" in model_args:
            model_args |= set(inspect.signature(self.call).parameters)
        # 检查每个传入的 model_kwargs 是否在模型参数中有对应的接收者
        for key, value in model_kwargs.items():
            if value is not None and key not in model_args:
                unused_model_args.append(key)

        if unused_model_args:
            # 抛出数值错误异常，指示有未使用的 model_kwargs 参数
            raise ValueError(
                f"The following `model_kwargs` are not used by the model: {unused_model_args} (note: typos in the"
                " generate arguments will also show up in this list)"
            )
        ) -> tf.Tensor:
        # 检查输入是否为 input_ids 类型且是二维的，并且数据类型为 tf.int32 或 tf.int64
        is_input_ids = len(inputs.shape) == 2 and inputs.dtype in (tf.int32, tf.int64)
        # 检查输入中是否存在 pad_token_id，并且在 inputs 中至少有一个元素等于 pad_token_id
        is_pad_token_in_inputs = (pad_token_id is not None) and tf.math.reduce_any(inputs == pad_token_id)
        # 检查 pad_token_id 是否不等于 eos_token_id（如果 eos_token_id 为 None，则始终为 True）
        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id != eos_token_id)

        # 如果输入是 input_ids 且存在 pad_token_id 且 pad_token_id 不等于 eos_token_id，则生成 attention_mask
        if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
            return tf.cast(tf.math.not_equal(inputs, pad_token_id), dtype=tf.int32)
        else:
            # 否则返回一个全为 1 的 tensor，形状为 inputs.shape[:2]
            return tf.ones(inputs.shape[:2], dtype=tf.int32)

    def _prepare_encoder_decoder_kwargs_for_generation(
        self, inputs_tensor: tf.Tensor, model_kwargs, model_input_name: Optional[str] = None
    ) -> Dict[str, Any]:
        # 1. 获取编码器并存储编码器输出
        encoder = self.get_encoder()

        # 2. 从 model_kwargs 中准备编码器参数和编码器关键字参数
        irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
        # 从 model_kwargs 中筛选出不以 irrelevant_prefix 开头的参数作为编码器参数
        encoder_kwargs = {
            argument: value
            for argument, value in model_kwargs.items()
            if not any(argument.startswith(p) for p in irrelevant_prefix)
        }
        # 检查编码器的调用签名，将符合签名的参数留下来
        encoder_signature = set(inspect.signature(encoder.call).parameters)
        encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
        if not encoder_accepts_wildcard:
            encoder_kwargs = {
                argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
            }

        # 3. 视觉模型不使用 `attention_mask`
        encoder_kwargs["return_dict"] = True
        encoder_kwargs[model_input_name] = inputs_tensor
        # 如果 model_input_name 不是 self.main_input_name，在 Keras 中必须始终传递第一个输入
        if model_input_name != self.main_input_name:
            encoder_kwargs[self.main_input_name] = None
        # 调用编码器并将编码器输出存储在 model_kwargs 中的 "encoder_outputs" 键下
        encoder_outputs = encoder(**encoder_kwargs)
        model_kwargs["encoder_outputs"] = encoder_outputs

        return model_kwargs

    def _prepare_decoder_input_ids_for_generation(
        self,
        batch_size: int,
        model_input_name: str,
        model_kwargs: Dict[str, tf.Tensor],
        decoder_start_token_id: int = None,
        bos_token_id: int = None,
    ) -> Tuple[tf.Tensor, Dict[str, tf.Tensor]]:
        """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
        # 1. 检查用户是否手动定义了 `decoder_input_ids`。为了方便输入命名，如果编码器没有将其用作主输入，也允许用户通过 `input_ids` 参数传递。
        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
            decoder_input_ids = model_kwargs.pop("decoder_input_ids")
        # 如果 `input_ids` 在 `model_kwargs` 中，并且 `model_input_name` 不是 "input_ids"，则也将其用作 `decoder_input_ids`
        elif "input_ids" in model_kwargs and model_input_name != "input_ids":
            decoder_input_ids = model_kwargs.pop("input_ids")
        else:
            # 否则，将 `decoder_input_ids` 设为 None
            decoder_input_ids = None

        # 2. 编码器-解码器模型期望 `decoder_input_ids` 以特殊标记开始。确保它符合这个要求。
        # 获取解码器的起始标记 ID
        decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
        # 用 `decoder_start_token_id` 创建起始的 `decoder_input_ids` 张量
        decoder_input_ids_start = tf.ones((batch_size, 1), dtype=tf.int32) * decoder_start_token_id

        # 如果没有用户输入 -> 使用 `decoder_start_token_id` 作为 `decoder_input_ids`
        if decoder_input_ids is None:
            decoder_input_ids = decoder_input_ids_start
        # 如果有用户输入但不以 `decoder_start_token_id` 开始 -> 在开头添加 `decoder_start_token_id`（并调整 `decoder_attention_mask` 如果提供了）
        elif tf.reduce_all(decoder_input_ids[:, 0] != decoder_start_token_id):
            decoder_input_ids = tf.concat([decoder_input_ids_start, decoder_input_ids], axis=-1)
            if "decoder_attention_mask" in model_kwargs:
                # 调整 `decoder_attention_mask`，在开头增加一个标记
                decoder_attention_mask = model_kwargs["decoder_attention_mask"]
                decoder_attention_mask = tf.concat(
                    (tf.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask),
                    axis=-1,
                )
                model_kwargs["decoder_attention_mask"] = decoder_attention_mask

        # 返回处理后的 `decoder_input_ids` 和可能修改过的 `model_kwargs`
        return decoder_input_ids, model_kwargs

    def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int:
        # 检索编码器-解码器模型的解码器起始标记 ID
        # 如果需要，回退到 `bos_token_id`
        decoder_start_token_id = (
            decoder_start_token_id
            if decoder_start_token_id is not None
            else self.generation_config.decoder_start_token_id
        )
        bos_token_id = bos_token_id if bos_token_id is not None else self.generation_config.bos_token_id

        # 如果 `decoder_start_token_id` 已定义，则返回它
        if decoder_start_token_id is not None:
            return decoder_start_token_id
        # 否则，如果 `bos_token_id` 已定义，则返回它
        elif bos_token_id is not None:
            return bos_token_id
        # 如果两者都未定义，则引发 ValueError 异常
        raise ValueError(
            "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
        )
    def _expand_inputs_for_generation(
        expand_size: int = 1,
        is_encoder_decoder: bool = False,
        input_ids: Optional[tf.Tensor] = None,
        expand_in_new_axis: bool = False,
        **model_kwargs,
    ) -> Tuple[tf.Tensor, Dict[str, Any]]:
        """
        Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...] or [batch_size, expand_size, ...],
        depending on `expand_in_new_axis`. Beam-based approaches expect this function to be used with
        `expand_in_new_axis=True`
        """
        
        def _expand_tensor(tensor: tf.Tensor):
            # 根据 `expand_in_new_axis` 参数选择不同的扩展方式
            if expand_in_new_axis:
                shape = shape_list(tensor)
                return tf.broadcast_to(tensor[:, None], (shape[0], expand_size) + tuple(shape[1:]))
            else:
                return tf.repeat(tensor, expand_size, axis=0)

        def _expand_dict_for_generation(dict_to_expand):
            # 遍历字典中的每个值，如果是 Tensor 类型且非空，则调用 `_expand_tensor` 函数扩展
            for key in dict_to_expand:
                if dict_to_expand[key] is not None and isinstance(dict_to_expand[key], tf.Tensor):
                    dict_to_expand[key] = _expand_tensor(dict_to_expand[key])
            return dict_to_expand

        if input_ids is not None:
            # 如果 `input_ids` 不为空，则调用 `_expand_tensor` 函数扩展 `input_ids`
            input_ids = _expand_tensor(input_ids)

        # 调用 `_expand_dict_for_generation` 函数扩展 `model_kwargs`
        model_kwargs = _expand_dict_for_generation(model_kwargs)

        if is_encoder_decoder:
            # 如果是编码-解码模型，确保 `encoder_outputs` 在 `model_kwargs` 中定义
            if model_kwargs.get("encoder_outputs") is None:
                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
            # 调用 `_expand_dict_for_generation` 函数扩展 `encoder_outputs`
            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])

        # 返回扩展后的 `input_ids` 和 `model_kwargs`
        return input_ids, model_kwargs

    def _prepare_model_inputs(
        self,
        inputs: Optional[tf.Tensor] = None,
        bos_token_id: Optional[int] = None,
        model_kwargs: Optional[Dict[str, tf.Tensor]] = None,
    ):
        """
        Prepares inputs for the model, optionally including a beginning-of-sequence token ID (`bos_token_id`).
        """
        # 此函数未提供实现，仅作为方法声明，用于准备模型的输入

    def _maybe_initialize_input_ids_for_generation(
        self,
        inputs: Optional[tf.Tensor] = None,
        bos_token_id: Optional[int] = None,
        model_kwargs: Optional[Dict[str, tf.Tensor]] = None,
    ):
        """
        Initializes `input_ids` for generation, optionally including a beginning-of-sequence token ID (`bos_token_id`).
        """
        # 此函数未提供实现，仅作为方法声明，用于为生成任务初始化 `input_ids`
    ) -> tf.Tensor:
        """Initializes input ids for generation, if necessary."""
        # 如果已经提供了输入，则直接返回输入
        if inputs is not None:
            return inputs

        # 获取模型参数中的编码器输出
        encoder_outputs = model_kwargs.get("encoder_outputs")
        # 如果是编码-解码模型并且有编码器输出，则创建一个全为-100的虚拟输入，以确保不会被用于编码
        if self.config.is_encoder_decoder and encoder_outputs is not None:
            shape = encoder_outputs.last_hidden_state.shape[:-1]
            return tf.ones(shape, dtype=tf.int32) * -100

        # 如果未提供输入且未定义bos_token_id，则抛出异常
        if bos_token_id is None:
            raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")

        # 如果在 `model_kwargs` 中有张量，则可以从中推断批量大小。这对于软提示或基于解码器的多模态实现很有帮助。
        batch_size = 1
        for value in model_kwargs.values():
            if isinstance(value, tf.Tensor):
                batch_size = value.shape[0]
                break
        # 创建一个形状为(batch_size, 1)的全为bos_token_id的张量，作为初始化的输入
        return tf.ones((batch_size, 1), dtype=tf.int32) * bos_token_id

    @staticmethod
    def _extract_past_from_model_output(outputs: ModelOutput):
        """Extracts past key values from model outputs."""
        past_key_values = None
        # 根据不同的输出结构，提取过去的键值
        if "past_key_values" in outputs:
            past_key_values = outputs.past_key_values
        elif "mems" in outputs:
            past_key_values = outputs.mems
        elif "past_buckets_states" in outputs:
            past_key_values = outputs.past_buckets_states
        return past_key_values

    def _update_model_kwargs_for_generation(
        self, outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False
    ) -> Dict[str, Any]:
        """Updates model keyword arguments for generation."""
        # 更新模型参数中的过去键值
        model_kwargs["past_key_values"] = self._extract_past_from_model_output(outputs)

        # 更新注意力掩码
        if not is_encoder_decoder:
            if "attention_mask" in model_kwargs:
                attention_mask = model_kwargs["attention_mask"]
                # 将一个形状为原注意力掩码+1列的张量与原注意力掩码拼接，用于后续生成过程中的扩展
                model_kwargs["attention_mask"] = tf.concat(
                    [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1
                )

        return model_kwargs

    def _update_model_kwargs_for_xla_generation(
        self,
        model_outputs: ModelOutput,
        model_kwargs: Dict[str, Any],
        cur_len: int,
        max_length: int,
        batch_size: int,
        is_encoder_decoder: bool = False,
        batch_axis: int = 0,
    ):
        """Updates model keyword arguments for XLA generation."""
        # 省略部分代码，未作注释要求的部分
        pass

    def _get_logits_warper(
        self,
        generation_config: GenerationConfig,
        # 省略部分代码，未作注释要求的部分
        ):
        """Gets the logits warper for generation."""
        pass
        ) -> TFLogitsProcessorList:
        """
        This class returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsWarper`]
        instances used for multinomial sampling.
        """

        # instantiate warpers list
        warpers = TFLogitsProcessorList()

        # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
        # better score (i.e. keep len(generation_config.eos_token_id) + 1)
        if generation_config.num_beams > 1:
            if isinstance(generation_config.eos_token_id, list):
                min_tokens_to_keep = len(generation_config.eos_token_id) + 1
            else:
                min_tokens_to_keep = 2
        else:
            min_tokens_to_keep = 1

        # Check if temperature warping is enabled and add warper accordingly
        if generation_config.temperature is not None and generation_config.temperature != 1.0:
            warpers.append(TFTemperatureLogitsWarper(generation_config.temperature))
        
        # Check if top-k warping is enabled and add warper accordingly
        if generation_config.top_k is not None and generation_config.top_k != 0:
            warpers.append(TFTopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
        
        # Check if top-p warping is enabled and add warper accordingly
        if generation_config.top_p is not None and generation_config.top_p < 1.0:
            warpers.append(TFTopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
        
        # Return the list of warpers containing all configured logits processors
        return warpers
    ) -> TFLogitsProcessorList:
        """
        This class returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsProcessor`]
        instances used to modify the scores of the language model head.
        """
        # 创建一个空的处理器列表
        processors = TFLogitsProcessorList()

        # 如果设定了重复惩罚并且不等于默认值 1.0，则添加重复惩罚处理器
        if generation_config.repetition_penalty is not None and generation_config.repetition_penalty != 1.0:
            processors.append(TFRepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty))
        
        # 如果设定了禁止重复 n-gram 大小，并且大于 0，则添加禁止重复 n-gram 处理器
        if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0:
            processors.append(TFNoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size))
        
        # 如果设定了要避免的词汇 ID 列表，则添加避免坏词汇处理器
        if generation_config.bad_words_ids is not None:
            processors.append(
                TFNoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)
            )
        
        # 如果设定了最小生成长度、结束符号 ID，并且最小长度大于 0，则添加最小长度处理器
        if (
            generation_config.min_length is not None
            and generation_config.eos_token_id is not None
            and generation_config.min_length > 0
        ):
            processors.append(TFMinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id))
        
        # 如果设定了强制起始符号 ID，则添加强制起始符号处理器
        if generation_config.forced_bos_token_id is not None:
            processors.append(TFForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id))
        
        # 如果设定了强制结束符号 ID，则添加强制结束符号处理器
        if generation_config.forced_eos_token_id is not None:
            processors.append(
                TFForcedEOSTokenLogitsProcessor(generation_config.max_length, generation_config.forced_eos_token_id)
            )
        
        # 如果设定了要抑制的 token 列表，则添加抑制 token 处理器
        if generation_config.suppress_tokens is not None:
            processors.append(TFSuppressTokensLogitsProcessor(generation_config.suppress_tokens))
        
        # 如果设定了要在开头抑制的 token 列表，则添加在开头抑制 token 处理器
        if generation_config.begin_suppress_tokens is not None:
            begin_index = input_ids_seq_length
            begin_index = (
                begin_index
                if (input_ids_seq_length > 1 or generation_config.forced_bos_token_id is None)
                else begin_index + 1
            )
            if generation_config.forced_decoder_ids is not None:
                begin_index += generation_config.forced_decoder_ids[-1][
                    0
                ]  # generation starts after the last token that is forced
            processors.append(
                TFSuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, begin_index)
            )
        
        # 如果设定了强制生成的 token 列表，则添加强制生成 token 处理器
        if generation_config.forced_decoder_ids is not None:
            processors.append(TFForceTokensLogitsProcessor(generation_config.forced_decoder_ids))

        # 合并默认处理器列表和自定义处理器列表，并返回最终的处理器列表
        processors = self._merge_criteria_processor_list(processors, logits_processor)
        return processors
    # 定义一个方法，接受一个自定义的 TFLogitsProcessorList 参数列表，并返回一个 TFLogitsProcessorList 对象
    def __init__(self, custom_list: List[TFLogitsProcessor] = []) -> TFLogitsProcessorList:
        # 如果 custom_list 是空的，则返回默认的 default_list
        if len(custom_list) == 0:
            return default_list
        # 遍历 default_list 中的每个元素
        for default in default_list:
            # 遍历 custom_list 中的每个元素
            for custom in custom_list:
                # 如果 custom 和 default 的类型相同
                if type(custom) is type(default):
                    # 设置对象类型为 "logits processor"
                    object_type = "logits processor"
                    # 抛出值错误异常，提醒用户 custom 对象已经存在
                    raise ValueError(
                        f"A custom {object_type} of type {type(custom)} with values {custom} has been passed to"
                        f" `generate`, but it has already been created with the values {default}. {default} has been"
                        " created by passing the corresponding arguments to generate or by the model's config default"
                        f" values. If you just want to change the default values of {object_type} consider passing"
                        f" them as arguments to `generate` instead of using a custom {object_type}."
                    )
        # 将 custom_list 中的元素扩展到 default_list 中
        default_list.extend(custom_list)
        # 返回扩展后的 default_list
        return default_list

    # 定义一个贪婪搜索方法，接受多个参数和关键字参数
    def greedy_search(
        self,
        input_ids: tf.Tensor,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        logits_processor: Optional[TFLogitsProcessorList] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        **model_kwargs,
    ):
        # 方法实现省略

    # 定义一个采样方法，接受多个参数和关键字参数
    def sample(
        self,
        input_ids: tf.Tensor,
        logits_processor: Optional[TFLogitsProcessorList] = None,
        logits_warper: Optional[TFLogitsProcessorList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        seed: Optional[Tuple[int, int]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        **model_kwargs,
    ):
        # 方法实现省略

    @staticmethod
    def _gather_beams(nested, beam_indices, batch_axis=0):
        """Gathers the beam slices indexed by beam_indices into new beam array."""

        def gather_fn(tensor):
            # 如果 batch_axis 大于 0，则将 batch_axis 之前的所有维度移到最后，以便得到形状为 (batch, beam_id, ...) 的张量
            if batch_axis > 0:
                perm = tf.concat((tf.range(tf.rank(tensor))[batch_axis:], tf.range(batch_axis)), axis=0)
                tensor = tf.transpose(tensor, perm=perm)

            # 在 axis=1 上使用 beam_indices 进行 gather 操作，得到聚集后的张量
            gathered_tensor = tf.gather(params=tensor, indices=beam_indices, axis=1, batch_dims=1)
            
            # 如果 batch_axis 大于 0，则将张量恢复到原始的维度顺序
            if batch_axis > 0:
                perm = tf.concat((tf.range(tf.rank(tensor))[batch_axis:], tf.range(batch_axis)), axis=0)
                perm = tf.math.invert_permutation(perm)
                gathered_tensor = tf.transpose(gathered_tensor, perm=perm)

            return gathered_tensor

        # 对 nested 结构中的每个张量应用 gather_fn 函数，并返回新的结构
        return tf.nest.map_structure(gather_fn, nested)
# 将给定的值按照批次索引散布到张量中
def scatter_values_on_batch_indices(values, batch_indices):
    # 获取批次索引张量的形状
    shape = shape_list(batch_indices)
    # 扩展批次维度以匹配形状
    broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1])
    # 将批次索引转换为对应的索引对
    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
    # 根据索引对将值散布到目标形状中
    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape)


def sample_without_replacement(logits, num_samples):
    """
    不重复的分类采样当前尚未实现，现在使用Gumbel-Max技巧代替，请参见
    https://github.com/tensorflow/tensorflow/issues/9260 获取更多信息
    """
    z = -tf.math.log(-tf.math.log(tf.random.uniform(shape_list(logits), 0, 1)))
    _, indices = tf.nn.top_k(logits + z, num_samples)
    return indices


def _ranking_fast(
    context_hidden: tf.Tensor,
    next_hidden: tf.Tensor,
    next_top_k_probs: tf.Tensor,
    alpha: float,
    beam_width: int,
) -> tf.Tensor:
    """
    根据文献《神经文本生成的对比框架》中描述的退化惩罚（与先前标记的余弦相似度）对top_k候选进行重新排序。
    返回批次中每行最佳候选的索引。
    """
    # 对上下文隐藏层进行归一化处理
    norm_context_hidden = context_hidden / tf.norm(context_hidden, axis=2, keepdims=True)
    # 对下一个隐藏层进行归一化处理
    norm_next_hidden = next_hidden / tf.norm(next_hidden, axis=2, keepdims=True)
    # 计算余弦相似度矩阵
    cosine_matrix = tf.squeeze(tf.linalg.matmul(norm_context_hidden, norm_next_hidden, transpose_b=True), axis=-1)
    # 计算最大余弦相似度的退化惩罚
    degeneration_penalty = tf.reduce_max(cosine_matrix, axis=-1)
    # 重塑下一个top_k概率
    next_top_k_probs = tf.reshape(next_top_k_probs, shape=[-1])
    # 计算对比分数，包括概率和退化惩罚
    contrastive_score = (1.0 - alpha) * next_top_k_probs - alpha * degeneration_penalty
    contrastive_score = tf.reshape(contrastive_score, shape=[-1, beam_width])
    # 选择每行中最高对比分数的索引
    selected_idx = tf.argmax(contrastive_score, axis=1)
    return selected_idx

`.\generation\utils.py`

# coding=utf-8
# 版权声明和许可信息，指定了本文件使用的Apache License, Version 2.0许可
# 此处为代码导入所需的标准库、第三方库及自定义模块

import copy  # 导入copy模块，用于对象的浅复制和深复制操作
import inspect  # 导入inspect模块，用于获取对象信息
import warnings  # 导入warnings模块，用于警告处理
from dataclasses import dataclass  # 从dataclasses模块导入dataclass装饰器，用于简化数据类的定义
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union  # 导入类型提示相关的类和函数

import torch  # 导入PyTorch库
import torch.distributed as dist  # 导入PyTorch分布式训练相关模块
from torch import nn  # 从torch模块中导入nn模块，用于神经网络构建

from ..cache_utils import Cache, DynamicCache, StaticCache  # 导入缓存相关的自定义模块
from ..integrations.deepspeed import is_deepspeed_zero3_enabled  # 导入深度学习加速相关模块
from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput  # 导入模型输出相关类
from ..models.auto import (  # 导入自动模型加载相关映射
    MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
    MODEL_FOR_CAUSAL_LM_MAPPING,
    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
    MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
    MODEL_FOR_VISION_2_SEQ_MAPPING,
)
from ..utils import ModelOutput, is_accelerate_available, is_torchdynamo_compiling, logging  # 导入工具类和函数
from .beam_constraints import DisjunctiveConstraint, PhrasalConstraint  # 导入束搜索相关约束类
from .beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer  # 导入束搜索相关评分器类
from .candidate_generator import (  # 导入候选生成器相关函数和类
    AssistedCandidateGenerator,
    CandidateGenerator,
    PromptLookupCandidateGenerator,
    _crop_past_key_values,
    _prepare_attention_mask,
    _prepare_token_type_ids,
)
from .configuration_utils import GenerationConfig, GenerationMode  # 导入生成配置和模式相关类
from .logits_process import (  # 导入logits处理相关类
    EncoderNoRepeatNGramLogitsProcessor,
    EncoderRepetitionPenaltyLogitsProcessor,
    EpsilonLogitsWarper,
    EtaLogitsWarper,
    ExponentialDecayLengthPenalty,
    ForcedBOSTokenLogitsProcessor,
    ForcedEOSTokenLogitsProcessor,
    ForceTokensLogitsProcessor,
    HammingDiversityLogitsProcessor,
    InfNanRemoveLogitsProcessor,
    LogitNormalization,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    MinNewTokensLengthLogitsProcessor,
    NoBadWordsLogitsProcessor,
    NoRepeatNGramLogitsProcessor,
    PrefixConstrainedLogitsProcessor,
    RepetitionPenaltyLogitsProcessor,
    SequenceBiasLogitsProcessor,
    SuppressTokensAtBeginLogitsProcessor,
    SuppressTokensLogitsProcessor,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
    TypicalLogitsWarper,
    UnbatchedClassifierFreeGuidanceLogitsProcessor,
)
from .stopping_criteria import (  # 导入停止条件相关类
    MaxLengthCriteria,
    MaxTimeCriteria,
    StoppingCriteria,
    StoppingCriteriaList,
    validate_stopping_criteria,
)

if TYPE_CHECKING:
    # 从相对路径导入模块中的PreTrainedModel类，用于模型预训练
    # 从相对路径导入streamers模块中的BaseStreamer类，用作基础流处理器
    from ..modeling_utils import PreTrainedModel
    from .streamers import BaseStreamer
# 获取名为__name__的模块的日志记录器对象
logger = logging.get_logger(__name__)

# 如果加速可用，导入加速相关的钩子函数和模块扩展函数
if is_accelerate_available():
    from accelerate.hooks import AlignDevicesHook, add_hook_to_module

# 静态缓存类型映射，将字符串"static"映射到StaticCache类
NEED_SETUP_CACHE_CLASSES_MAPPING = {
    "static": StaticCache,
}

# 数据类，用于生成仅解码器输出的模型结果，继承自ModelOutput类
@dataclass
class GenerateDecoderOnlyOutput(ModelOutput):
    """
    Outputs of decoder-only generation models, when using non-beam methods.
    """
    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True` is passed or when `config.output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            NOTE: some models have a different `past_key_values` format, confirm with the model's documentation.
            Usually a Tuple (one element for each layer of the decoder) of tuples (two elements, key tensor and value
            tensor). The first Tuple is of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.
    # 声明一个可选的变量 hidden_states，其类型是一个元组，包含一个元组，该元组中包含一个 torch.FloatTensor 类型的值
    hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    
    # 声明一个可选的变量 past_key_values，其类型是一个元组，包含一个元组，该元组中包含一个元组，该元组中包含一个 torch.FloatTensor 类型的值
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
# 用于生成编码器-解码器模型的输出，非使用 Beam 方法时的情况
@dataclass
class GenerateEncoderDecoderOutput(ModelOutput):
    """
    编码器-解码器生成模型的输出，当不使用 Beam 方法时。

    """

    sequences: torch.LongTensor = None  # 生成的序列（token ID）
    scores: Optional[Tuple[torch.FloatTensor]] = None  # 每个生成序列的分数
    logits: Optional[Tuple[torch.FloatTensor]] = None  # 每个生成序列的 logits
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None  # 编码器注意力权重
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 编码器隐藏状态
    decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # 解码器注意力权重
    cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # 交叉注意力权重
    decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # 解码器隐藏状态
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None  # 额外的过去键值（针对 Transformer 模型）

# 用于生成仅解码器模型的输出，使用 Beam 方法时的情况
@dataclass
class GenerateBeamDecoderOnlyOutput(ModelOutput):
    """
    解码器生成模型的输出，仅在使用 Beam 方法时。

    """

    sequences: torch.LongTensor = None  # 生成的序列（token ID）
    sequences_scores: Optional[torch.FloatTensor] = None  # 生成序列的分数
    scores: Optional[Tuple[torch.FloatTensor]] = None  # 每个生成序列的分数
    logits: Optional[Tuple[torch.FloatTensor]] = None  # 每个生成序列的 logits
    beam_indices: Optional[torch.LongTensor] = None  # Beam 搜索时使用的索引
    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # 注意力权重
    hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # 隐藏状态
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None  # 额外的过去键值（针对 Transformer 模型）

# 用于生成编码器-解码器模型的输出，使用 Beam 方法时的情况
@dataclass
class GenerateBeamEncoderDecoderOutput(ModelOutput):
    """
    编码器-解码器生成模型的输出，使用 Beam 方法时。

    """

    sequences: torch.LongTensor = None  # 生成的序列（token ID）
    sequences_scores: Optional[torch.FloatTensor] = None  # 生成序列的分数
    scores: Optional[Tuple[torch.FloatTensor]] = None  # 每个生成序列的分数
    logits: Optional[Tuple[torch.FloatTensor]] = None  # 每个生成序列的 logits
    beam_indices: Optional[torch.LongTensor] = None  # Beam 搜索时使用的索引
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None  # 编码器注意力权重
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 编码器隐藏状态
    decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # 解码器注意力权重
    cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # 交叉注意力权重
    decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # 解码器隐藏状态
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None  # 额外的过去键值（针对 Transformer 模型）

# 以下是为了向后兼容而保留的等效类
GreedySearchDecoderOnlyOutput = GenerateDecoderOnlyOutput  # 贪婪搜索解码器模型的输出
ContrastiveSearchDecoderOnlyOutput = GenerateDecoderOnlyOutput  # 对比搜索解码器模型的输出
SampleDecoderOnlyOutput = GenerateDecoderOnlyOutput  # 示例解码器模型的输出

ContrastiveSearchEncoderDecoderOutput = GenerateEncoderDecoderOutput  # 对比搜索编码器-解码器模型的输出
GreedySearchEncoderDecoderOutput = GenerateEncoderDecoderOutput  # 贪婪搜索编码器-解码器模型的输出
SampleEncoderDecoderOutput = GenerateEncoderDecoderOutput  # 示例编码器-解码器模型的输出

BeamSearchDecoderOnlyOutput = GenerateBeamDecoderOnlyOutput  # Beam 搜索解码器模型的输出
BeamSampleDecoderOnlyOutput = GenerateBeamDecoderOnlyOutput  # Beam 示例解码器模型的输出

BeamSearchEncoderDecoderOutput = GenerateBeamEncoderDecoderOutput  # Beam 搜索编码器-解码器模型的输出
BeamSampleEncoderDecoderOutput = GenerateBeamEncoderDecoderOutput  # Beam 示例编码器-解码器模型的输出

GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput]  # 贪婪搜索的输出类型
# Typing shortcuts for specific types of model outputs
SampleOutput = Union[SampleEncoderDecoderOutput, SampleDecoderOnlyOutput]
BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput]
BeamSampleOutput = Union[BeamSampleEncoderDecoderOutput, BeamSampleDecoderOnlyOutput]
ContrastiveSearchOutput = Union[ContrastiveSearchEncoderDecoderOutput, ContrastiveSearchDecoderOnlyOutput]

# Typing shortcut for non-beam text generation output
GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput]
# Typing shortcut for beam search text generation output
GenerateBeamOutput = Union[GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput]
# Typing shortcut for any text generation output
GenerateOutput = Union[GenerateNonBeamOutput, GenerateBeamOutput]


class GenerationMixin:
    """
    A class containing all functions for auto-regressive text generation, to be used as a mixin in [`PreTrainedModel`].

    The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
        - *greedy decoding* by calling [`~generation.GenerationMixin._greedy_search`] if `num_beams=1` and
          `do_sample=False`
        - *contrastive search* by calling [`~generation.GenerationMixin._contrastive_search`] if `penalty_alpha>0` and
          `top_k>1`
        - *multinomial sampling* by calling [`~generation.GenerationMixin._sample`] if `num_beams=1` and
          `do_sample=True`
        - *beam-search decoding* by calling [`~generation.GenerationMixin._beam_search`] if `num_beams>1` and
          `do_sample=False`
        - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin._beam_sample`] if `num_beams>1`
          and `do_sample=True`
        - *diverse beam-search decoding* by calling [`~generation.GenerationMixin._group_beam_search`], if `num_beams>1`
          and `num_beam_groups>1`
        - *constrained beam-search decoding* by calling [`~generation.GenerationMixin._constrained_beam_search`], if
          `constraints!=None` or `force_words_ids!=None`
        - *assisted decoding* by calling [`~generation.GenerationMixin._assisted_decoding`], if
            `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`

    You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To
    learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
    """

    def prepare_inputs_for_generation(self, *args, **kwargs):
        # Raise an error if this method is not implemented in the subclass
        raise NotImplementedError(
            "A model class needs to define a `prepare_inputs_for_generation` method in order to use `.generate()`."
        )

    def _prepare_model_inputs(
        self,
        inputs: Optional[torch.Tensor] = None,
        bos_token_id: Optional[int] = None,
        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ):
        # Internal method for preparing model inputs for text generation
        ...

    def _maybe_initialize_input_ids_for_generation(
        self,
        inputs: Optional[torch.Tensor] = None,
        bos_token_id: Optional[int] = None,
        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ):
        # Internal method to initialize input IDs for text generation if necessary
        ...
    ) -> torch.LongTensor:
        """Initializes input ids for generation, if necessary."""
        # 如果已经提供了输入，则直接返回输入
        if inputs is not None:
            return inputs

        # 获取模型关键字参数中的 encoder_outputs
        encoder_outputs = model_kwargs.get("encoder_outputs")
        # 如果模型是编码-解码模型且 encoder_outputs 不为空
        if self.config.is_encoder_decoder and encoder_outputs is not None:
            # 创建一个与 encoder_outputs 最后一层隐藏状态相同形状的输入 id 张量，填充值为 -100
            shape = encoder_outputs.last_hidden_state.size()[:-1]
            return torch.ones(shape, dtype=torch.long, device=self.device) * -100

        # 如果未提供 input_ids 且未定义 bos_token_id，则引发错误
        if bos_token_id is None:
            raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")

        # 如果 model_kwargs 中有某些张量，则可以从中推断出批量大小
        batch_size = 1
        for value in model_kwargs.values():
            if isinstance(value, torch.Tensor):
                batch_size = value.shape[0]
                break

        # 如果 model_kwargs 中包含 "inputs_embeds" 键
        if "inputs_embeds" in model_kwargs:
            # 返回一个形状为 (batch_size, 0) 的全 1 张量，dtype 为 torch.long
            return torch.ones((batch_size, 0), dtype=torch.long, device=self.device)
        # 否则返回一个形状为 (batch_size, 1) 的全 bos_token_id 值的张量，dtype 为 torch.long
        return torch.ones((batch_size, 1), dtype=torch.long, device=self.device) * bos_token_id

    def _prepare_attention_mask_for_generation(
        self,
        inputs: torch.Tensor,
        pad_token_id: Optional[int],
        eos_token_id: Optional[Union[int, List[int]]],
    ) -> torch.LongTensor:
        # 检查输入是否为 input_ids 且已被填充，只有这种情况下才定义 attention_mask
        is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long]
        is_pad_token_in_inputs = (pad_token_id is not None) and (pad_token_id in inputs)
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id not in eos_token_id)

        # 如果输入是 input_ids 且已填充，并且填充标记不等于 eos_token_id，则返回 attention_mask
        if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
            return inputs.ne(pad_token_id).long()
        else:
            # 否则返回一个形状与 inputs 的前两维相同的全 1 张量，dtype 为 torch.long
            return torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device)

    def _prepare_encoder_decoder_kwargs_for_generation(
        self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None
        # 1. 获取编码器
        encoder = self.get_encoder()

        # 2. 兼容加速大模型推断：确保编码器在与输入相同的设备上输出结果
        if hasattr(self, "hf_device_map"):
            # 如果编码器有 `_hf_hook` 属性，设置其 `io_same_device` 为 True
            if hasattr(encoder, "_hf_hook"):
                encoder._hf_hook.io_same_device = True
            # 否则，向编码器添加一个 AlignDevicesHook，设置 `io_same_device` 为 True
            else:
                add_hook_to_module(encoder, AlignDevicesHook(io_same_device=True))

        # 3. 从模型参数中准备编码器的参数和关键字参数
        irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
        # 从 `model_kwargs` 中选择与编码器相关的参数和值
        encoder_kwargs = {
            argument: value
            for argument, value in model_kwargs.items()
            if not any(argument.startswith(p) for p in irrelevant_prefix)
        }
        # 检查编码器的输入签名，确定是否支持 `kwargs` 或 `model_kwargs`
        encoder_signature = set(inspect.signature(encoder.forward).parameters)
        encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
        if not encoder_accepts_wildcard:
            # 如果编码器不支持通配符参数，仅选择编码器签名中存在的参数和值
            encoder_kwargs = {
                argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
            }

        # 4. 确保编码器返回 `ModelOutput`
        model_input_name = model_input_name if model_input_name is not None else self.main_input_name
        encoder_kwargs["return_dict"] = True
        encoder_kwargs[model_input_name] = inputs_tensor

        # 调用编码器并将结果保存在 `model_kwargs` 的 `encoder_outputs` 键中
        model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs)

        return model_kwargs

    # 准备用于生成的解码器输入 ID
    def _prepare_decoder_input_ids_for_generation(
        self,
        batch_size: int,
        model_input_name: str,
        model_kwargs: Dict[str, torch.Tensor],
        decoder_start_token_id: Union[int, List[int]] = None,
        bos_token_id: int = None,
        device: torch.device = None,
    ) -> Dict[str, torch.Tensor]:
        ...

    # 获取解码器起始标记 ID
    def _get_decoder_start_token_id(
        self, decoder_start_token_id: Union[int, List[int]] = None, bos_token_id: int = None
    ) -> int:
        ...

    # 扩展用于生成的输入
    @staticmethod
    def _expand_inputs_for_generation(
        expand_size: int = 1,
        is_encoder_decoder: bool = False,
        input_ids: Optional[torch.LongTensor] = None,
        **model_kwargs,
    ) -> Dict[str, torch.Tensor]:
        ...
    ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
        """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
        # 定义函数签名，指定返回类型为元组，包含一个长整型张量和一个任意类型字典

        def _expand_dict_for_generation(dict_to_expand):
            # 为生成过程扩展字典中的张量
            for key in dict_to_expand:
                if dict_to_expand[key] is not None and isinstance(dict_to_expand[key], torch.Tensor):
                    dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
            return dict_to_expand

        if input_ids is not None:
            input_ids = input_ids.repeat_interleave(expand_size, dim=0)
            # 如果输入 ID 不为空，则按照指定的扩展大小在指定维度上重复扩展

        model_kwargs = _expand_dict_for_generation(model_kwargs)
        # 扩展模型参数字典中的张量

        if is_encoder_decoder:
            if model_kwargs.get("encoder_outputs") is None:
                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
            # 如果是编码器-解码器模型，确保编码器输出在模型参数中被定义，并进行扩展

        return input_ids, model_kwargs
        # 返回扩展后的输入 ID 和模型参数字典

    def _extract_past_from_model_output(self, outputs: ModelOutput, standardize_cache_format: bool = False):
        past_key_values = None
        if "past_key_values" in outputs:
            past_key_values = outputs.past_key_values
        elif "mems" in outputs:
            past_key_values = outputs.mems
        elif "past_buckets_states" in outputs:
            past_key_values = outputs.past_buckets_states
        # 从模型输出中提取过去的键-值对

        # Bloom fix: standardizes the cache format when requested
        if standardize_cache_format and hasattr(self, "_convert_to_standard_cache"):
            batch_size = outputs.logits.shape[0]
            past_key_values = self._convert_to_standard_cache(past_key_values, batch_size=batch_size)
            # 在请求时，如果需要，标准化缓存格式

        return past_key_values
        # 返回提取的过去键-值对

    def _update_model_kwargs_for_generation(
        self,
        outputs: ModelOutput,
        model_kwargs: Dict[str, Any],
        is_encoder_decoder: bool = False,
        standardize_cache_format: bool = False,
        # 更新用于生成的模型参数字典
    ) -> Dict[str, Any]:
        # 更新 model_kwargs 中的 past_key_values，从模型输出中提取过去的键值
        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
            outputs, standardize_cache_format=standardize_cache_format
        )
        # 如果 outputs 有 state 属性，则更新 model_kwargs 中的 state
        if getattr(outputs, "state", None) is not None:
            model_kwargs["state"] = outputs.state

        # 更新 token_type_ids，使用最后一个值进行扩展
        if "token_type_ids" in model_kwargs:
            token_type_ids = model_kwargs["token_type_ids"]
            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)

        # 如果不是 encoder-decoder 架构
        if not is_encoder_decoder:
            # 更新 attention_mask
            if "attention_mask" in model_kwargs:
                attention_mask = model_kwargs["attention_mask"]
                model_kwargs["attention_mask"] = torch.cat(
                    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
                )
        else:
            # 更新 decoder_attention_mask
            if "decoder_attention_mask" in model_kwargs:
                decoder_attention_mask = model_kwargs["decoder_attention_mask"]
                model_kwargs["decoder_attention_mask"] = torch.cat(
                    [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))],
                    dim=-1,
                )

        # 如果 model_kwargs 中存在 cache_position 并且不为 None，则更新 cache_position
        if "cache_position" in model_kwargs and model_kwargs["cache_position"] is not None:
            model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1

        # 返回更新后的 model_kwargs
        return model_kwargs

    # 抛出未实现错误，提示在当前类的模块中实现 _reorder_cache 函数以启用 beam search
    def _reorder_cache(self, past_key_values, beam_idx):
        raise NotImplementedError(
            f"Make sure that a `_reorder_cache` function is correctly implemented in {self.__class__.__module__} to"
            f" enable beam search for {self.__class__}"
        )

    # 返回用于辅助生成的候选生成器
    def _get_candidate_generator(
        self,
        generation_config: GenerationConfig,
        input_ids: torch.LongTensor,
        inputs_tensor: torch.Tensor,
        assistant_model: "PreTrainedModel",
        logits_processor: LogitsProcessorList,
        model_kwargs: Dict,
    ) -> CandidateGenerator:
        """
        Returns the candidate generator to be used in `assisted_generation`
        """
        # 如果指定了 prompt_lookup_num_tokens，则使用 PromptLookupCandidateGenerator
        if generation_config.prompt_lookup_num_tokens is not None:
            candidate_generator = PromptLookupCandidateGenerator(
                num_output_tokens=generation_config.prompt_lookup_num_tokens,
                max_matching_ngram_size=generation_config.max_matching_ngram_size,
            )
        else:
            # 否则使用 AssistedCandidateGenerator
            candidate_generator = AssistedCandidateGenerator(
                input_ids=input_ids,
                assistant_model=assistant_model,
                generation_config=generation_config,
                logits_processor=logits_processor,
                model_kwargs=model_kwargs,
                inputs_tensor=inputs_tensor,
            )
        return candidate_generator
    def _get_logits_warper(
        self,
        generation_config: GenerationConfig,
    ) -> LogitsProcessorList:
        """
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsWarper`] instances
        used for multinomial sampling.
        """

        # instantiate warpers list
        warpers = LogitsProcessorList()

        # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
        # better score (i.e. keep len(list(generation_config.eos_token_id)) + 1)
        if generation_config.num_beams > 1:
            if isinstance(generation_config.eos_token_id, list):
                min_tokens_to_keep = len(generation_config.eos_token_id) + 1
            else:
                min_tokens_to_keep = 2
        else:
            min_tokens_to_keep = 1

        # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
        # all samplers can be found in `generation_utils_samplers.py`
        
        # Apply temperature warping if temperature is defined and not equal to 1.0
        if generation_config.temperature is not None and generation_config.temperature != 1.0:
            warpers.append(TemperatureLogitsWarper(generation_config.temperature))
        
        # Apply top-k warping if top-k is defined and not equal to 0
        if generation_config.top_k is not None and generation_config.top_k != 0:
            warpers.append(TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
        
        # Apply top-p warping if top-p is defined and less than 1.0
        if generation_config.top_p is not None and generation_config.top_p < 1.0:
            warpers.append(TopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
        
        # Apply typical-p warping if typical-p is defined and less than 1.0
        if generation_config.typical_p is not None and generation_config.typical_p < 1.0:
            warpers.append(
                TypicalLogitsWarper(mass=generation_config.typical_p, min_tokens_to_keep=min_tokens_to_keep)
            )
        
        # Apply epsilon cutoff warping if epsilon cutoff is defined and within (0, 1)
        if generation_config.epsilon_cutoff is not None and 0.0 < generation_config.epsilon_cutoff < 1.0:
            warpers.append(
                EpsilonLogitsWarper(epsilon=generation_config.epsilon_cutoff, min_tokens_to_keep=min_tokens_to_keep)
            )
        
        # Apply eta cutoff warping if eta cutoff is defined and within (0, 1)
        if generation_config.eta_cutoff is not None and 0.0 < generation_config.eta_cutoff < 1.0:
            warpers.append(
                EtaLogitsWarper(epsilon=generation_config.eta_cutoff, min_tokens_to_keep=min_tokens_to_keep)
            )
        
        # `LogitNormalization` should always be the last logit processor, when present
        # Apply logit normalization if renormalize_logits flag is True
        if generation_config.renormalize_logits is True:
            warpers.append(LogitNormalization())
        
        # Return the list of warpers containing all relevant LogitsWarper instances
        return warpers
    # 获取 logits 处理器函数，根据给定的配置和参数
    def _get_logits_processor(
        self,
        generation_config: GenerationConfig,  # 生成配置对象
        input_ids_seq_length: int,  # 输入的序列长度
        encoder_input_ids: torch.LongTensor,  # 编码器输入的张量
        prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]],  # 可以使用的前缀令牌函数
        logits_processor: Optional[LogitsProcessorList],  # logits 处理器的可选列表
        model_kwargs: Optional[Dict[str, Any]] = None,  # 模型参数的可选字典，默认为空
        negative_prompt_ids: Optional[torch.Tensor] = None,  # 负面提示的可选张量，默认为空
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,  # 负面提示的注意力掩码，可选，默认为空
    ):
        # 定义 stopping_criteria 对象并初始化为空列表
        criteria = StoppingCriteriaList()
        
        # 如果生成配置中指定了最大长度
        if generation_config.max_length is not None:
            # 从模型配置中获取最大位置嵌入数
            max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
            # 向 criteria 中添加最大长度的停止条件
            criteria.append(
                MaxLengthCriteria(
                    max_length=generation_config.max_length,
                    max_position_embeddings=max_position_embeddings,
                )
            )
        
        # 如果生成配置中指定了最大时间
        if generation_config.max_time is not None:
            # 向 criteria 中添加最大时间的停止条件
            criteria.append(MaxTimeCriteria(max_time=generation_config.max_time))
        
        # 将自定义的 stopping_criteria 合并到 criteria 中
        criteria = self._merge_criteria_processor_list(criteria, stopping_criteria)
        
        # 返回最终的 criteria 列表
        return criteria

    # 合并默认列表和自定义列表的 logits 处理器或停止条件
    def _merge_criteria_processor_list(
        self,
        default_list: Union[LogitsProcessorList, StoppingCriteriaList],  # 默认的处理器或停止条件列表
        custom_list: Union[LogitsProcessorList, StoppingCriteriaList],  # 自定义的处理器或停止条件列表
    ) -> Union[LogitsProcessorList, StoppingCriteriaList]:  # 返回合并后的处理器或停止条件列表
        # 如果自定义列表为空，直接返回默认列表
        if len(custom_list) == 0:
            return default_list
        
        # 遍历默认列表
        for default in default_list:
            # 遍历自定义列表
            for custom in custom_list:
                # 如果自定义的对象类型和默认的对象类型相同
                if type(custom) is type(default):
                    # 确定对象类型是停止条件还是 logits 处理器
                    object_type = "stopping criteria" if isinstance(custom, StoppingCriteria) else "logits processor"
                    # 抛出值错误，提示不允许自定义与默认相同类型的处理器或条件
                    raise ValueError(
                        f"A custom {object_type} of type {type(custom)} with values {custom} has been passed to"
                        f" `.generate()`, but it has already been created with the values {default}. {default} has been"
                        " created by passing the corresponding arguments to generate or by the model's config default"
                        f" values. If you just want to change the default values of {object_type} consider passing"
                        f" them as arguments to `.generate()` instead of using a custom {object_type}."
                    )
        
        # 将自定义列表的内容扩展到默认列表中
        default_list.extend(custom_list)
        
        # 返回合并后的默认列表
        return default_list

    # 计算转移分数的函数
    def compute_transition_scores(
        self,
        sequences: torch.Tensor,  # 序列张量
        scores: Tuple[torch.Tensor],  # 分数元组
        beam_indices: Optional[torch.Tensor] = None,  # 光束索引的可选张量，默认为空
        normalize_logits: bool = False,  # 是否对 logits 进行归一化，默认为 False
    def _validate_model_class(self):
        """
        Confirms that the model class is compatible with generation. If not, raises an exception that points to the
        right class to use.
        """
        # 检查当前模型是否能够生成文本
        if not self.can_generate():
            # 可生成的模型映射列表
            generate_compatible_mappings = [
                MODEL_FOR_CAUSAL_LM_MAPPING,
                MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
                MODEL_FOR_VISION_2_SEQ_MAPPING,
                MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
                MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
            ]
            generate_compatible_classes = set()
            # 遍历可生成的模型映射列表，获取支持的模型类名集合
            for model_mapping in generate_compatible_mappings:
                supported_models = model_mapping.get(type(self.config), default=None)
                if supported_models is not None:
                    generate_compatible_classes.add(supported_models.__name__)
            # 出现异常的错误信息
            exception_message = (
                f"The current model class ({self.__class__.__name__}) is not compatible with `.generate()`, as "
                "it doesn't have a language model head."
            )
            # 如果存在兼容的模型类名集合，则添加到异常信息中
            if generate_compatible_classes:
                exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
            # 抛出类型错误异常，包含详细的异常信息
            raise TypeError(exception_message)
    # 执行与生成长度相关的验证，包括警告和错误处理

    # 1. 针对参数化不良的最大长度警告
    if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
        # 如果使用了默认的 `max_length`（=20）来控制生成长度，会发出警告
        warnings.warn(
            f"Using the model-agnostic default `max_length` (={generation_config.max_length}) to control the "
            "generation length. We recommend setting `max_new_tokens` to control the maximum length of the "
            "generation.",
            UserWarning,
        )
    
    # 如果输入的ids长度超过了指定的最大长度，会引发异常
    if input_ids_length >= generation_config.max_length:
        input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
        raise ValueError(
            f"Input length of {input_ids_string} is {input_ids_length}, but `max_length` is set to"
            f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
            " increasing `max_length` or, better yet, setting `max_new_tokens`."
        )

    # 2. 由于不可行的参数组合，发出最小长度警告
    min_length_error_suffix = (
        " Generation will stop at the defined maximum length. You should decrease the minimum length and/or "
        "increase the maximum length."
    )
    if has_default_max_length:
        min_length_error_suffix += (
            f" Note that `max_length` is set to {generation_config.max_length}, its default value."
        )
    
    # 如果设定了最小长度，并且该长度大于最大长度，则发出警告
    if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
        warnings.warn(
            f"Unfeasible length constraints: `min_length` ({generation_config.min_length}) is larger than"
            f" the maximum possible length ({generation_config.max_length})." + min_length_error_suffix,
            UserWarning,
        )
    
    # 如果设置了最小新token数量，并且计算后的最小长度超过了最大长度，则发出警告
    if generation_config.min_new_tokens is not None:
        min_length = generation_config.min_new_tokens + input_ids_length
        if min_length > generation_config.max_length:
            warnings.warn(
                f"Unfeasible length constraints: `min_new_tokens` ({generation_config.min_new_tokens}), when "
                f"added to the prompt length ({input_ids_length}), is larger than"
                f" the maximum possible length ({generation_config.max_length})." + min_length_error_suffix,
                UserWarning,
            )
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        generation_config: Optional[GenerationConfig] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
        synced_gpus: Optional[bool] = None,
        assistant_model: Optional["PreTrainedModel"] = None,
        streamer: Optional["BaseStreamer"] = None,
        negative_prompt_ids: Optional[torch.Tensor] = None,
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        """
        Generates sequences based on the provided inputs and configuration.

        Args:
            inputs (Optional[torch.Tensor]): Input tensor for generation.
            generation_config (Optional[GenerationConfig]): Configuration for generation.
            logits_processor (Optional[LogitsProcessorList]): Processors for logits during generation.
            stopping_criteria (Optional[StoppingCriteriaList]): Criteria for stopping generation.
            prefix_allowed_tokens_fn (Optional[Callable[[int, torch.Tensor], List[int]]]): Function to allow tokens during generation.
            synced_gpus (Optional[bool]): Whether to synchronize generation across GPUs.
            assistant_model (Optional["PreTrainedModel"]): Model used for generation assistance.
            streamer (Optional["BaseStreamer"]): Streamer for generation.
            negative_prompt_ids (Optional[torch.Tensor]): IDs for negative prompts.
            negative_prompt_attention_mask (Optional[torch.Tensor]): Attention mask for negative prompts.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: Dictionary containing generated sequences and other relevant outputs.
        """
        ...

    def _has_unfinished_sequences(self, this_peer_finished: bool, synced_gpus: bool, device: torch.device) -> bool:
        """
        Returns whether there are still unfinished sequences on the specified device.

        Args:
            this_peer_finished (bool): Flag indicating if the current peer has finished generation.
            synced_gpus (bool): Whether generation is synchronized across GPUs.
            device (torch.device): Device on which generation is performed.

        Returns:
            bool: True if there are unfinished sequences, False otherwise.
        """
        if synced_gpus:
            # Under synced_gpus, ensure all GPUs complete their sequence generation.
            this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(device)
            # Send 0.0 if this peer finished, 1.0 otherwise
            dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
            # Check if all peers finished (sum should be 0.0 if all finished)
            if this_peer_finished_flag.item() == 0.0:
                return False
        elif this_peer_finished:
            return False
        return True

    def contrastive_search(self, *args, **kwargs):
        """
        Deprecated method for performing contrastive search. Use `generate` or a custom generation loop instead.

        Args:
            *args: Positional arguments passed to `_contrastive_search`.
            **kwargs: Keyword arguments passed to `_contrastive_search`.

        Returns:
            Any: Result from `_contrastive_search`.
        """
        logger.warning_once(
            "Calling `contrastive_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
            "custom generation loop instead.",
        )
        return self._contrastive_search(*args, **kwargs)

    @torch.no_grad()
    def _contrastive_search(
        self,
        input_ids: torch.LongTensor,
        top_k: Optional[int] = 1,
        penalty_alpha: Optional[float] = 0,
        logits_processor: Optional[LogitsProcessorList] = None,
        logits_warper: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[Union[int, List[int]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        output_logits: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: bool = False,
        streamer: Optional["BaseStreamer"] = None,
        sequential: Optional[bool] = None,
        **model_kwargs,
    ):
        """
        Performs contrastive search to generate sequences based on the input_ids and additional arguments.

        Args:
            input_ids (torch.LongTensor): Input tensor containing token IDs.
            top_k (Optional[int]): Number of top-k results to consider.
            penalty_alpha (Optional[float]): Penalty factor for contrastive search.
            logits_processor (Optional[LogitsProcessorList]): Processors for logits during contrastive search.
            logits_warper (Optional[LogitsProcessorList]): Processors for logits warping during contrastive search.
            stopping_criteria (Optional[StoppingCriteriaList]): Criteria for stopping contrastive search.
            pad_token_id (Optional[int]): Token ID for padding.
            eos_token_id (Optional[Union[int, List[int]]]): Token ID(s) for end-of-sequence.
            output_attentions (Optional[bool]): Whether to output attention weights.
            output_hidden_states (Optional[bool]): Whether to output hidden states.
            output_scores (Optional[bool]): Whether to output scores.
            output_logits (Optional[bool]): Whether to output logits.
            return_dict_in_generate (Optional[bool]): Whether to return results in a dictionary format.
            synced_gpus (bool): Whether generation is synchronized across GPUs.
            streamer (Optional["BaseStreamer"]): Streamer for contrastive search.
            sequential (Optional[bool]): Whether to generate sequentially.
            **model_kwargs: Additional keyword arguments.

        Returns:
            Any: Result of contrastive search, typically sequences or generated outputs.
        """
        ...
    # 发出警告日志，提醒直接调用该方法已经被废弃，将在 v4.41 版本中移除，建议使用 `generate` 方法或自定义生成循环代替。
    def greedy_search(self, *args, **kwargs):
        logger.warning_once(
            "Calling `greedy_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
            "custom generation loop instead.",
        )
        # 调用 `_greedy_search` 方法，并将所有传入的位置参数和关键字参数传递给它
        return self._greedy_search(*args, **kwargs)

    # 发出警告日志，提醒直接调用该方法已经被废弃，将在 v4.41 版本中移除，建议使用 `generate` 方法或自定义生成循环代替。
    def _greedy_search(
        self,
        input_ids: torch.LongTensor,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[Union[int, List[int]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        output_logits: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: bool = False,
        streamer: Optional["BaseStreamer"] = None,
        **model_kwargs,
    ):
        # 方法实现略去，用于执行贪婪搜索算法或相关任务
        pass

    # 发出警告日志，提醒直接调用该方法已经被废弃，将在 v4.41 版本中移除，建议使用 `generate` 方法或自定义生成循环代替。
    def sample(self, *args, **kwargs):
        logger.warning_once(
            "Calling `sample` directly is deprecated and will be removed in v4.41. Use `generate` or a "
            "custom generation loop instead.",
        )
        # 调用 `_sample` 方法，并将所有传入的位置参数和关键字参数传递给它
        return self._sample(*args, **kwargs)

    # 发出警告日志，提醒直接调用该方法已经被废弃，将在 v4.41 版本中移除，建议使用 `generate` 方法或自定义生成循环代替。
    def _sample(
        self,
        input_ids: torch.LongTensor,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        logits_warper: Optional[LogitsProcessorList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[Union[int, List[int]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        output_logits: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: bool = False,
        streamer: Optional["BaseStreamer"] = None,
        **model_kwargs,
    ):
        # 方法实现略去，用于执行采样或相关生成任务
        pass
    def _temporary_reorder_cache(self, past_key_values, beam_idx):
        """
        Temporary function to handle the different types of cache reordering processes while we roll out `Cache`.

        TODO: standardize cache formats and make all models compatible with `Cache`. It would remove the need
        for this function, with `Cache.reorder_cache` being the sole remaining code path
        """
        # 获取当前类名的小写形式
        model_class = self.__class__.__name__.lower()
        
        # 异常情况1：处理使用传统缓存格式的模型的代码路径
        if isinstance(past_key_values, (tuple, list)):
            past_key_values = self._reorder_cache(past_key_values, beam_idx)
        
        # 异常情况2：处理具有不同缓存格式的模型。这些模型目前仅限于 `DynamicCache`，直到它们的缓存格式标准化为止。
        elif "bloom" in model_class or "gptbigcode" in model_class:
            if not isinstance(past_key_values, DynamicCache):
                raise ValueError(
                    f"Using an unsupported cache format with {model_class}. Currently, it only supports the "
                    "legacy tuple format or `DynamicCache`"
                )
            past_key_values = self._reorder_cache(past_key_values, beam_idx)
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
        
        # 标准代码路径：使用 `Cache.reorder_cache`
        else:
            past_key_values.reorder_cache(beam_idx)
        
        return past_key_values

    def beam_search(self, *args, **kwargs):
        logger.warning_once(
            "Calling `beam_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
            "custom generation loop instead.",
        )
        return self._beam_search(*args, **kwargs)

    def _beam_search(
        self,
        input_ids: torch.LongTensor,
        beam_scorer: BeamScorer,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[Union[int, List[int]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        output_logits: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: bool = False,
        sequential: Optional[bool] = None,
        **model_kwargs,
    ):
        """
        Perform beam search to generate sequences based on input_ids and beam_scorer.
        """
        logger.warning_once(
            "Calling `beam_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
            "custom generation loop instead.",
        )
        return self._beam_search(*args, **kwargs)

    def beam_sample(self, *args, **kwargs):
        logger.warning_once(
            "Calling `beam_sample` directly is deprecated and will be removed in v4.41. Use `generate` or a "
            "custom generation loop instead.",
        )
        return self._beam_sample(*args, **kwargs)
    # 定义一个私有方法 `_beam_sample`，用于执行束搜索采样
    def _beam_sample(
        self,
        input_ids: torch.LongTensor,
        beam_scorer: BeamScorer,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        logits_warper: Optional[LogitsProcessorList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[Union[int, List[int]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        output_logits: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: bool = False,
        **model_kwargs,
    ):
        # 具体功能的注释可以在方法内部详细描述
        pass

    # 警告用户 `group_beam_search` 方法即将在 v4.41 版本中移除，建议使用 `generate` 方法或自定义生成循环
    def group_beam_search(self, *args, **kwargs):
        logger.warning_once(
            "Calling `group_beam_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
            "custom generation loop instead.",
        )
        # 调用 `_group_beam_search` 方法来执行实际的束搜索操作
        return self._group_beam_search(*args, **kwargs)

    # 定义一个私有方法 `_group_beam_search`，用于执行束搜索
    def _group_beam_search(
        self,
        input_ids: torch.LongTensor,
        beam_scorer: BeamScorer,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[Union[int, List[int]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        output_logits: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: bool = False,
        **model_kwargs,
    ):
        # 具体功能的注释可以在方法内部详细描述
        pass

    # 警告用户 `constrained_beam_search` 方法即将在 v4.41 版本中移除，建议使用 `generate` 方法或自定义生成循环
    def constrained_beam_search(self, *args, **kwargs):
        logger.warning_once(
            "Calling `constrained_beam_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
            "custom generation loop instead.",
        )
        # 调用 `_constrained_beam_search` 方法来执行实际的约束束搜索操作
        return self._constrained_beam_search(*args, **kwargs)

    # 定义一个私有方法 `_constrained_beam_search`，用于执行约束束搜索
    def _constrained_beam_search(
        self,
        input_ids: torch.LongTensor,
        constrained_beam_scorer: ConstrainedBeamSearchScorer,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[Union[int, List[int]]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        output_logits: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: Optional[bool] = None,
        **model_kwargs,
    ):
        # 具体功能的注释可以在方法内部详细描述
        pass
    # 发出警告日志，提醒直接调用 `_assisted_decoding` 方法已不推荐，在 v4.41 版本中将被移除。建议使用 `generate` 方法或自定义生成循环。
    logger.warning_once(
        "Calling `_assisted_decoding` directly is deprecated and will be removed in v4.41. Use `generate` or a "
        "custom generation loop instead.",
    )
    # 调用 `_assisted_decoding` 方法，将所有传入的位置参数和关键字参数传递给它，并返回其结果。
    return self._assisted_decoding(*args, **kwargs)
def _speculative_sampling(
    candidate_input_ids,
    candidate_logits,
    candidate_length,
    new_logits,
    last_assistant_token_is_eos,
    max_matches,
):
    """
    Applies sampling as in the speculative decoding paper (https://arxiv.org/pdf/2211.17192.pdf, algorithm 1). Returns
    the selected tokens, as well as the number of candidate matches.

    NOTE: Unless otherwise stated, the variable names match those in the paper.
    """
    # Selects the last `candidate_length` tokens from `candidate_input_ids`
    new_candidate_input_ids = candidate_input_ids[:, -candidate_length:]

    # Converts logits to probabilities and extracts assistant (q_i) and model (p_i) probabilities for selected tokens
    q = candidate_logits.softmax(dim=-1)
    q_i = q[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1)
    p = new_logits.softmax(dim=-1)
    p_i = p[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1)
    probability_ratio = p_i / q_i

    # Determines which tokens to accept based on probability ratios
    r_i = torch.rand_like(probability_ratio)
    is_accepted = r_i <= probability_ratio

    # Computes the number of accepted tokens (`n_matches` in algorithm 1)
    n_matches = ((~is_accepted).cumsum(dim=-1) < 1).sum()

    # Ensures the generated sequence does not exceed `max_matches` or end with an EOS token
    if last_assistant_token_is_eos and n_matches == candidate_length:
        # Adjusts `n_matches` if the sequence ends with an EOS token
        n_matches -= 1
        valid_tokens = new_candidate_input_ids[:, : n_matches + 1]
    else:
        n_matches = min(n_matches, max_matches)

        # Selects the next token considering rejection and adjusts probabilities if needed
        gamma = min(candidate_logits.shape[1], max_matches)
        p_n_plus_1 = p[:, n_matches, :]
        if n_matches < gamma:
            q_n_plus_1 = q[:, n_matches, :]
            p_prime = torch.clamp((p_n_plus_1 - q_n_plus_1), min=0)
            p_prime.div_(p_prime.sum())
        else:
            p_prime = p_n_plus_1
        t = torch.multinomial(p_prime, num_samples=1).squeeze(1)[None, :]

        # Constructs the final sequence of valid tokens
        if n_matches > 0:
            valid_tokens = torch.cat((new_candidate_input_ids[:, :n_matches], t), dim=-1)
        else:
            valid_tokens = t

    return valid_tokens, n_matches
    # 给定多个生成的标记的解码器注意力或隐藏状态，将其拆分成一个元组，其中每个成员对应于单个生成的标记。
    """
    # 兼容性调整：在我们的生成函数中，第一次迭代包含了关于提示的注意力/隐藏状态。
    if len(outputs) == 0:
        # 初始化一个空的元组
        new_tuple = ()
        # 遍历新输出的每一层
        for layer in new_outputs:
            # 如果是解码器的注意力，使用当前长度和最后一维的大小；否则使用整个层的大小
            last_dim_size = cur_len if is_decoder_attention else layer.shape[-1]
            # 将当前层的片段添加到新元组中
            new_tuple += (layer[..., :cur_len, :last_dim_size],)
        # 将新元组作为一个元素添加到输出元组中
        outputs += (new_tuple,)
        # 更新当前长度变量，因为第一次迭代包含了提示 + 1个生成的标记
        cur_len += 1
        # 更新添加的长度变量
        added_len -= cur_len
    
    # 对于每个额外添加的长度
    for i in range(added_len):
        # 初始化一个空的元组
        new_tuple = ()
        # 遍历新输出的每一层
        for layer in new_outputs:
            # 如果是解码器的注意力，使用当前长度加上i和最后一维的大小；否则使用整个层的大小
            last_dim_size = cur_len + i if is_decoder_attention else layer.shape[-1]
            # 将当前层的片段添加到新元组中
            new_tuple += (layer[..., i : i + 1, :last_dim_size],)
        # 将新元组作为一个元素添加到输出元组中
        outputs += (new_tuple,)
    # 返回输出元组
    return outputs
# 根据上下文隐藏状态的每个向量的L2范数归一化，使其长度为1，以便计算余弦相似度
norm_context_hidden = context_hidden / context_hidden.norm(dim=2, keepdim=True)

# 根据下一个隐藏状态的每个向量的L2范数归一化，使其长度为1，以便计算余弦相似度
norm_next_hidden = next_hidden / next_hidden.norm(dim=2, keepdim=True)

# 计算上下文隐藏状态与下一个隐藏状态之间的余弦相似度矩阵，将维度调整为[B*K, S]
cosine_matrix = torch.matmul(norm_context_hidden, norm_next_hidden.transpose(1, 2)).squeeze(-1)

# 在余弦相似度矩阵的最后一个维度上取最大值，得到每个样本的最大相似度，形状为[B*K]
degeneration_penalty, _ = torch.max(cosine_matrix, dim=-1)

# 将下一个顶部K个候选项的概率视图调整为一维数组，形状为[B*K]
next_top_k_probs = next_top_k_probs.view(-1)

# 计算对比分数，根据论文中的对比框架计算每个候选项的分数
contrastive_score = (1.0 - alpha) * next_top_k_probs - alpha * degeneration_penalty

# 将对比分数按照beam_width分割，形状调整为[B, K]的张量
contrastive_score = torch.stack(torch.split(contrastive_score, beam_width))

# 在每行中选择最高分数对应的索引，形状为[B]
_, selected_idx = contrastive_score.max(dim=-1)

# 返回每个批次中最佳候选项的索引
return selected_idx



# 处理数据分割的函数，根据数据类型分别处理不同情况的数据分割
def _split(data, full_batch_size: int, split_size: int = None):
    if data is None:
        # 如果数据为None，则返回与分割大小对应的None列表
        return [None] * (full_batch_size // split_size)
    if isinstance(data, torch.Tensor):
        # 如果数据为Tensor，则按照分割大小分割Tensor，返回Tensor列表
        return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
    elif isinstance(data, tuple):
        # 如果数据为元组，根据元组中元素的类型进行不同的分割处理
        if isinstance(data[0], tuple):
            # 如果元组中的元素也是元组，则按照分割大小分割每个元组中的Tensor，返回元组列表的元组列表
            return [
                tuple(tuple(tensor[i : i + split_size] for tensor in inner_tuple) for inner_tuple in data)
                for i in range(0, full_batch_size, split_size)
            ]
        else:
            # 如果元组中的元素不是元组，则按照分割大小分割每个Tensor，返回元组列表
            return [
                tuple(sub_tensor[i : i + split_size] for sub_tensor in data)
                for i in range(0, full_batch_size, split_size)
            ]
    else:
        # 如果数据类型不符合预期，则引发值错误异常
        raise ValueError(f"Unexpected attribute type: {type(data)}")



# 将模型输入（可能是ModelOutput或Dict类型）按照指定的分割大小拆分成相同类型的对象列表
def _split_model_inputs(
    model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int
) -> List[Union[ModelOutput, Dict]]:
    """
    Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
    size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from
    previous forward pass.
    """
    # 如果 model_input 为 None，则返回一个 Nones 列表
    # 在 Whisper 中，encoder_outputs 为 None 时会发生这种情况
    if model_input is None:
        return [model_input] * (full_batch_size // split_size)
    # 从对象中推断出类
    model_output_cls = type(model_input)
    if (full_batch_size % split_size) != 0:
        # 如果 full_batch_size 不能被 split_size 整除，则引发 ValueError
        raise ValueError("`full_batch_size` must be divisible by `split_size`")

    if split_size > full_batch_size:
        # 如果 split_size 大于 full_batch_size，则引发 ValueError
        raise ValueError("`split_size` must be smaller or equal to `full_batch_size`")

    # 用于拆分张量或张量的元组的辅助函数

    # 查找所有数据类字段（例如，last_hidden_state，pooler_output 等），并对它们进行拆分
    keys = (
        model_input.__dataclass_fields__.keys() if hasattr(model_input, "__dataclass_fields__") else model_input.keys()
    )
    # 仅保留在 model_input 中的键
    keys = [k for k in keys if k in model_input]
    # 在这里，我们可以有四种类型的值：张量、张量的元组和布尔值，以及 encoder_outputs，后者是一个 ModelOutput 对象。
    # 布尔值不应该被拆分，而应该为每个拆分复制
    bool_keys = [k for k in keys if isinstance(model_input[k], bool) or k == "cache_position"]
    keys_to_ignore = ["cache_position", "encoder_outputs"]
    non_bool_keys = [k for k in keys if not isinstance(model_input[k], bool) and k not in keys_to_ignore]

    # 拆分张量和张量的元组
    data_split_list = [
        {k: _split(model_input[k], full_batch_size, split_size)[i] for k in non_bool_keys}
        for i in range(full_batch_size // split_size)
    ]
    # 布尔值是相同的，每个拆分中都会复制
    bool_data = {k: model_input[k] for k in bool_keys}
    # encoder_outputs 是一个 ModelOutput 对象，应该单独拆分
    if "encoder_outputs" in model_input:
        encoder_outputs_split = _split_model_inputs(model_input["encoder_outputs"], split_size, full_batch_size)
        data_split_list = [
            {**data_split, "encoder_outputs": encoder_outputs_split[i]} for i, data_split in enumerate(data_split_list)
        ]

    # 将列表中的每个字典转换为推断类的对象
    split_model_inputs: List[Union[ModelOutput, Dict]] = [
        model_output_cls(**data_split, **bool_data) for data_split in data_split_list
    ]

    return split_model_inputs
# 将给定的 ModelOutput 对象列表沿着 batch_size 维度堆叠起来。该函数推断出列表中的具体 ModelOutput 子类。
def stack_model_outputs(model_outputs: List[ModelOutput]) -> ModelOutput:
    """
    Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
    specific ModelOutput subclass from the list provided.
    """
    # 如果输入的列表为空，则抛出数值错误
    if not model_outputs:
        raise ValueError("Input list is empty.")

    # 推断出列表中第一个对象的类
    model_output_cls = type(model_outputs[0])

    # 确保所有对象都是同一类型
    if not all(isinstance(obj, model_output_cls) for obj in model_outputs):
        raise ValueError("All elements in the list should be of the same type.")

    # 辅助函数，用于连接张量或张量元组
    def _concat(data):
        """
        Reverse of `_split` function above.
        """
        # 如果数据中任意元素为 None，则返回 None
        if any(data is None for data in data):
            return None
        # 如果第一个元素是 torch.Tensor
        if isinstance(data[0], torch.Tensor):
            # 沿着 dim=0 连接所有张量
            return torch.cat(data, dim=0)
        # 如果第一个元素是元组
        elif isinstance(data[0], tuple):
            # 如果元组的元素也是元组（例如我们之前的示例中的 past_key_values）
            if isinstance(data[0][0], tuple):
                # 对每个元组的每个元素，沿着 dim=0 连接所有张量
                return tuple(
                    tuple(torch.cat([attr[i][j] for attr in data], dim=0) for j in range(len(data[0][0])))
                    for i in range(len(data[0]))
                )
            else:
                # 否则，对元组中的每个元素，沿着 dim=0 连接所有张量
                return tuple(torch.cat([attr[i] for attr in data], dim=0) for i in range(len(data[0])))
        # 如果第一个元素是整数或浮点数，返回一个张量
        elif isinstance(data[0], (int, float)):
            return torch.tensor(data)
        else:
            # 抛出数值错误，显示意外的属性类型
            raise ValueError(f"Unexpected attribute type: {type(data[0])}")

    # 使用字典推导式，从所有对象中收集属性并连接它们
    concatenated_data = {
        # 对于每个属性 k，在所有模型输出对象中，获取属性 k 的值并连接它们
        k: _concat([getattr(model_output, k) for model_output in model_outputs])
        for k in model_output_cls.__dataclass_fields__.keys()
    }

    # 返回一个新的推断类对象，其中包含连接后的属性
    return model_output_cls(**concatenated_data)