from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal, Optional, Union

from peft.config import PeftConfig
from peft.utils import PeftType


RoadVariant = Literal["road_1", "road_2", "road_4"]


@dataclass
class RoadConfig(PeftConfig):
""" |
|
|
This is the configuration class to store the configuration of a [`RoadModel`]. RoAd adapter is proposed in |
|
|
https://arxiv.org/pdf/2409.00119. |
|
|
|
|
|
    Args:
        variant (Union[`RoadVariant`, `str`]):
            The variant of the Road model to use. It can be one of road_1, road_2, or road_4. Refer to the paper for
            more details.

            - road_1: Uses the same scale and angle for all pairs of elements. This variant has the lowest number of
              parameters; it stores a number of parameters equal to the output hidden size for each layer that RoAd
              is applied to.
            - road_2: Uses the same scale and angle for each element. This variant has 2x the number of parameters
              compared to road_1.
            - road_4: Uses two different scales and angles for each element. This variant has 4x the number of
              parameters compared to road_1.
        group_size (`int`):
            Group size defines how elements are grouped together into 2D vectors for rotation. Within each group,
            element 0 is paired with element group_size/2, then element 1 is paired with element group_size/2+1, and
            so on. This has no effect on the model performance, since elements are unordered; however, it has some
            effect on inference speed when used in e.g. vLLM. For best speed, a group size of at least 32 or 64 (the
            default) is recommended. Note that the model hidden size (or hidden size per partition when used with
            tensor parallelism) must be divisible by group_size, so for very small models you might need to reduce
            this parameter.
        init_weights (`bool`):
            Whether to perform initialization of RoAd weights.
        target_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to apply the adapter to. If this is specified, only the modules with the
            specified names will be replaced. When passing a string, a regex match will be performed. When passing a
            list of strings, either an exact match will be performed or it is checked if the name of the module ends
            with any of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are
            chosen, excluding the output layer if the model is a PreTrainedModel. If this is not specified, modules
            will be chosen according to the model architecture. If the architecture is not known, an error will be
            raised -- in this case, you should specify the target modules manually.
        modules_to_save (`List[str]`):
            List of modules apart from RoAd layers to be set as trainable and saved in the final checkpoint.

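    Example (a minimal usage sketch; it assumes `RoadConfig` and `get_peft_model` are importable from the top-level
    `peft` package, and the model name and target module names below are only illustrative):

        ```py
        >>> from transformers import AutoModelForCausalLM
        >>> from peft import RoadConfig, get_peft_model
        >>> base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
        >>> config = RoadConfig(variant="road_2", target_modules=["q_proj", "v_proj"])
        >>> model = get_peft_model(base_model, config)
        >>> model.print_trainable_parameters()
        ```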
""" |
|
|
|
|
|
    variant: Union[str, RoadVariant] = field(
        default="road_1",
        metadata={"help": "Variant of the Road model to use."},
    )
    group_size: int = field(
        default=64,
        metadata={
            "help": (
                "Group size defines how elements are grouped together into 2D vectors for rotation. "
                "Within each group, element 0 is paired with element group_size/2, "
                "then element 1 is paired with element group_size/2+1, and so on. "
                "This has no effect on the model performance, since elements are unordered; "
                "however, it has some effect on inference speed when used in e.g. vLLM. "
                "For best speed, a group size of at least 64 is recommended. "
                "Note that the model hidden size (or hidden size per partition when used with tensor parallelism) "
                "must be divisible by group_size, so for very small models you might need to reduce this parameter."
            )
        },
    )
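    # Illustration of the pairing described in the docstring, using a hypothetical group_size of 8: element i in the
    # first half of each group is rotated together with element i + group_size // 2 in the second half, so the 2D
    # pairs would be (0, 4), (1, 5), (2, 6) and (3, 7).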
    init_weights: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to initialize the weights of the RoAd layers with their default initialization. Don't change "
                "this setting, except if you know exactly what you're doing."
            ),
        },
    )
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "List of module names or regex expression of the module names to replace with Road. "
                "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
                "This can also be a wildcard 'all-linear' which matches all linear/Conv1D modules "
                "(excluding the output layer if the model is a PreTrainedModel). "
                "If not specified, modules will be chosen according to the model architecture. If the architecture is "
                "not known, an error will be raised -- in this case, you should specify the target modules manually."
            ),
        },
    )
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": (
                "List of modules apart from RoAd layers to be set as trainable and saved in the final checkpoint. For"
                " example, in Sequence Classification or Token Classification tasks, the final layer"
                " `classifier/score` is randomly initialized and as such needs to be trainable and saved."
            )
        },
    )

    def __post_init__(self):
        super().__post_init__()
        self.peft_type = PeftType.ROAD
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        if self.variant not in ["road_1", "road_2", "road_4"]:
            raise ValueError(f"Invalid variant {self.variant} specified. Please choose from road_1, road_2 or road_4")
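        # The pairing described in the docstring splits each group into two equal halves, so group_size must be a
        # positive even number.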
        if self.group_size <= 0 or self.group_size % 2 != 0:
            raise ValueError(
                f"The group_size must be a positive even number when using RoadLayer, but got {self.group_size}."
            )