
Commit 15357cd

[breaking] support transformers 4.48 (hiyouga#6628)
1 parent 45e68b9 commit 15357cd

17 files changed: +53 −105 lines

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion

@@ -22,10 +22,10 @@ jobs:
       fail-fast: false
       matrix:
         python-version:
-          - "3.8" # TODO: remove py38 in next transformers release
           - "3.9"
           - "3.10"
           - "3.11"
+          - "3.12"
         os:
           - "ubuntu-latest"
           - "windows-latest"

README.md

Lines changed: 6 additions & 6 deletions

@@ -377,11 +377,11 @@ huggingface-cli login
 
 | Mandatory    | Minimum | Recommend |
 | ------------ | ------- | --------- |
-| python       | 3.8     | 3.11      |
+| python       | 3.9     | 3.10      |
 | torch        | 1.13.1  | 2.4.0     |
-| transformers | 4.41.2  | 4.43.4    |
-| datasets     | 2.16.0  | 2.20.0    |
-| accelerate   | 0.30.1  | 0.32.0    |
+| transformers | 4.41.2  | 4.45.2    |
+| datasets     | 2.16.0  | 3.2.0     |
+| accelerate   | 0.34.0  | 1.2.1     |
 | peft         | 0.11.1  | 0.12.0    |
 | trl          | 0.8.6   | 0.9.6     |
@@ -390,8 +390,8 @@ huggingface-cli login
 | CUDA         | 11.6    | 12.2      |
 | deepspeed    | 0.10.0  | 0.14.0    |
 | bitsandbytes | 0.39.0  | 0.43.1    |
-| vllm         | 0.4.3   | 0.5.0     |
-| flash-attn   | 2.3.0   | 2.6.3     |
+| vllm         | 0.4.3   | 0.6.6     |
+| flash-attn   | 2.3.0   | 2.7.2     |
 
 ### Hardware Requirement

README_zh.md

Lines changed: 6 additions & 6 deletions

@@ -379,11 +379,11 @@ huggingface-cli login
 
 | 必需项       | 至少    | 推荐      |
 | ------------ | ------- | --------- |
-| python       | 3.8     | 3.11      |
+| python       | 3.9     | 3.10      |
 | torch        | 1.13.1  | 2.4.0     |
-| transformers | 4.41.2  | 4.43.4    |
-| datasets     | 2.16.0  | 2.20.0    |
-| accelerate   | 0.30.1  | 0.32.0    |
+| transformers | 4.41.2  | 4.45.2    |
+| datasets     | 2.16.0  | 3.2.0     |
+| accelerate   | 0.34.0  | 1.2.1     |
 | peft         | 0.11.1  | 0.12.0    |
 | trl          | 0.8.6   | 0.9.6     |
@@ -392,8 +392,8 @@ huggingface-cli login
 | CUDA         | 11.6    | 12.2      |
 | deepspeed    | 0.10.0  | 0.14.0    |
 | bitsandbytes | 0.39.0  | 0.43.1    |
-| vllm         | 0.4.3   | 0.5.0     |
-| flash-attn   | 2.3.0   | 2.6.3     |
+| vllm         | 0.4.3   | 0.6.6     |
+| flash-attn   | 2.3.0   | 2.7.2     |
 
 ### 硬件依赖

requirements.txt

Lines changed: 5 additions & 4 deletions

@@ -1,9 +1,10 @@
-transformers>=4.41.2,<=4.46.1
-datasets>=2.16.0,<=3.1.0
-accelerate>=0.34.0,<=1.0.1
+transformers>=4.41.2,<=4.45.2;python_version<'3.10'
+transformers>=4.41.2,<=4.48.1,!=4.46.*,!=4.47.*,!=4.48.0;python_version>='3.10'
+datasets>=2.16.0,<=3.2.0
+accelerate>=0.34.0,<=1.2.1
 peft>=0.11.1,<=0.12.0
 trl>=0.8.6,<=0.9.6
-tokenizers>=0.19.0,<0.20.4
+tokenizers>=0.19.0,<=0.21.0
 gradio>=4.38.0,<=5.12.0
 pandas>=2.0.0
 scipy
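
Note: the split transformers pins above lean on PEP 508 environment markers (the `;python_version<'3.10'` suffix), so pip selects exactly one of the two lines per interpreter. A minimal sketch of how such markers evaluate, using the `packaging` library that pip itself vendors; the marker strings are copied from the diff, everything else is illustrative:

# Sketch: how pip-style environment markers pick a requirement line.
from packaging.markers import Marker

legacy = Marker("python_version < '3.10'")
modern = Marker("python_version >= '3.10'")

# Evaluate against explicit environments instead of the running interpreter.
for py in ("3.9", "3.10", "3.12"):
    env = {"python_version": py}
    picked = "<=4.45.2 pin" if legacy.evaluate(env) else "<=4.48.1 pin (4.46/4.47/4.48.0 excluded)"
    print(py, "->", picked)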

setup.py

Lines changed: 3 additions & 3 deletions

@@ -46,7 +46,7 @@ def get_console_scripts() -> List[str]:
     "torch": ["torch>=1.13.1"],
     "torch-npu": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"],
     "metrics": ["nltk", "jieba", "rouge-chinese"],
-    "deepspeed": ["deepspeed>=0.10.0,<=0.14.4"],
+    "deepspeed": ["deepspeed>=0.10.0,<=0.16.2"],
     "liger-kernel": ["liger-kernel"],
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "hqq": ["hqq"],
@@ -92,7 +92,7 @@ def main():
         url="https://github.com/hiyouga/LLaMA-Factory",
         package_dir={"": "src"},
         packages=find_packages("src"),
-        python_requires=">=3.8.0",
+        python_requires=">=3.9.0",
         install_requires=get_requires(),
         extras_require=extra_require,
         entry_points={"console_scripts": get_console_scripts()},
@@ -104,10 +104,10 @@ def main():
             "License :: OSI Approved :: Apache Software License",
             "Operating System :: OS Independent",
             "Programming Language :: Python :: 3",
-            "Programming Language :: Python :: 3.8",
             "Programming Language :: Python :: 3.9",
             "Programming Language :: Python :: 3.10",
             "Programming Language :: Python :: 3.11",
+            "Programming Language :: Python :: 3.12",
             "Topic :: Scientific/Engineering :: Artificial Intelligence",
         ],
     )

src/llamafactory/__init__.py

Lines changed: 5 additions & 5 deletions

@@ -20,17 +20,17 @@
 
 Dependency graph:
   main:
-    transformers>=4.41.2,<=4.46.1
-    datasets>=2.16.0,<=3.1.0
-    accelerate>=0.34.0,<=1.0.1
+    transformers>=4.41.2,<=4.48.1,!=4.46.*,!=4.47.*,!=4.48.0
+    datasets>=2.16.0,<=3.2.0
+    accelerate>=0.34.0,<=1.2.1
     peft>=0.11.1,<=0.12.0
     trl>=0.8.6,<=0.9.6
   attention:
     transformers>=4.42.4 (gemma+fa2)
   longlora:
-    transformers>=4.41.2,<=4.46.1
+    transformers>=4.41.2,<4.48.0
   packing:
-    transformers>=4.43.0,<=4.46.1
+    transformers>=4.43.0,<=4.48.1
 
 Disable version checking: DISABLE_VERSION_CHECK=1
 Enable VRAM recording: RECORD_VRAM=1

src/llamafactory/extras/misc.py

Lines changed: 6 additions & 3 deletions

@@ -34,6 +34,7 @@
 from transformers.utils.versions import require_version
 
 from . import logging
+from .packages import is_transformers_version_greater_than
 
 
 _is_fp16_available = is_torch_npu_available() or is_torch_cuda_available()
@@ -93,11 +94,13 @@ def check_dependencies() -> None:
     r"""
     Checks the version of the required packages.
     """
-    check_version("transformers>=4.41.2,<=4.46.1")
-    check_version("datasets>=2.16.0,<=3.1.0")
-    check_version("accelerate>=0.34.0,<=1.0.1")
+    check_version("transformers>=4.41.2,<=4.48.1,!=4.46.0,!=4.46.1,!=4.46.2,!=4.46.3,!=4.47.0,!=4.47.1,!=4.48.0")
+    check_version("datasets>=2.16.0,<=3.2.0")
+    check_version("accelerate>=0.34.0,<=1.2.1")
     check_version("peft>=0.11.1,<=0.12.0")
     check_version("trl>=0.8.6,<=0.9.6")
+    if is_transformers_version_greater_than("4.46.0") and not is_transformers_version_greater_than("4.48.1"):
+        logger.warning_rank0_once("There are known bugs in transformers v4.46.0-v4.48.0, please use other versions.")
 
 
 def calculate_tps(dataset: Sequence[Dict[str, Any]], metrics: Dict[str, float], stage: Literal["sft", "rm"]) -> float:
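
Note: the bad releases are enumerated one by one here (`!=4.46.0,!=4.46.1,...`), while requirements.txt uses PEP 440 wildcards (`!=4.46.*`); presumably `transformers.utils.versions.require_version`, which backs `check_version`, does not accept wildcard specifiers. A rough sketch of the same constraint expressed with `packaging.specifiers.SpecifierSet` (illustrative only, not the helper the repo uses):

# Sketch: the pinned range as a single PEP 440 specifier set.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=4.41.2,<=4.48.1,!=4.46.*,!=4.47.*,!=4.48.0")

for v in ("4.45.2", "4.46.1", "4.47.0", "4.48.0", "4.48.1"):
    print(v, "ok" if Version(v) in spec else "rejected")
# 4.45.2 and 4.48.1 pass; the buggy 4.46.x/4.47.x/4.48.0 releases are rejected.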

src/llamafactory/extras/packages.py

Lines changed: 0 additions & 5 deletions

@@ -87,11 +87,6 @@ def is_transformers_version_greater_than(content: str):
     return _get_package_version("transformers") >= version.parse(content)
 
 
-@lru_cache
-def is_transformers_version_equal_to_4_46():
-    return version.parse("4.46.0") <= _get_package_version("transformers") <= version.parse("4.46.1")
-
-
 def is_uvicorn_available():
     return _is_package_available("uvicorn")

src/llamafactory/model/model_utils/longlora.py

Lines changed: 1 addition & 1 deletion

@@ -350,7 +350,7 @@ def shift(state: "torch.Tensor") -> "torch.Tensor":
 
 
 def _apply_llama_patch() -> None:
-    check_version("transformers>=4.41.2,<=4.46.1")
+    check_version("transformers>=4.41.2,<4.48.0")
     LlamaAttention.forward = llama_attention_forward
     LlamaFlashAttention2.forward = llama_flash_attention_2_forward
     LlamaSdpaAttention.forward = llama_sdpa_attention_forward

src/llamafactory/model/model_utils/packing.py

Lines changed: 1 addition & 1 deletion

@@ -118,6 +118,6 @@ def configure_packing(model_args: "ModelArguments", is_trainable: bool) -> None:
     if not is_trainable or not model_args.block_diag_attn:
         return
 
-    check_version("transformers>=4.43.0,<=4.46.1")
+    check_version("transformers>=4.43.0,<=4.48.1")
     transformers.modeling_flash_attention_utils._get_unpad_data = get_unpad_data
     logger.info_rank0("Using block diagonal attention for sequence packing without cross-attention.")

src/llamafactory/train/dpo/trainer.py

Lines changed: 5 additions & 12 deletions

@@ -29,7 +29,7 @@
 from typing_extensions import override
 
 from ...extras.constants import IGNORE_INDEX
-from ...extras.packages import is_transformers_version_equal_to_4_46, is_transformers_version_greater_than
+from ...extras.packages import is_transformers_version_greater_than
 from ..callbacks import SaveProcessorCallback
 from ..trainer_utils import create_custom_optimizer, create_custom_scheduler, get_batch_logps, nested_detach
 
@@ -282,19 +282,12 @@ def compute_loss(
         self, model: "PreTrainedModel", inputs: Dict[str, "torch.Tensor"], return_outputs: bool = False, **kwargs
     ) -> Union["torch.Tensor", Tuple["torch.Tensor", List["torch.Tensor"]]]:
         r"""
-        Fixes the loss value. See https://github.com/huggingface/transformers/pull/35438 for details.
+        Subclass and override to accept extra kwargs.
         """
-        loss = super().compute_loss(model, inputs, return_outputs)
-        if is_transformers_version_equal_to_4_46() and kwargs.get("num_items_in_batch"):
-            if return_outputs:
-                loss = (loss[0] / self.args.gradient_accumulation_steps, *loss[1:])
-            else:
-                loss = loss / self.args.gradient_accumulation_steps
-
-        return loss
+        return super().compute_loss(model, inputs, return_outputs)
 
     @override
-    def log(self, logs: Dict[str, float]) -> None:
+    def log(self, logs: Dict[str, float], *args, **kwargs) -> None:
         r"""
         Log `logs` on the various objects watching training, including stored metrics.
         """
@@ -318,4 +311,4 @@ def log(self, logs: Dict[str, float]) -> None:
         if not key.startswith("dummy_"):
             logs[key] = metric
 
-        return Trainer.log(self, logs)
+        return Trainer.log(self, logs, *args, **kwargs)
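
Note: the widened `log(self, logs, *args, **kwargs)` signature is a forward-compatibility shim: newer transformers releases pass an extra argument to `Trainer.log` (a start-time value), so an override pinned to the old two-argument form would raise a `TypeError`. A minimal sketch of the pattern with hypothetical class names (the KTO trainer below gets the identical change):

# Sketch of the override pattern (hypothetical names, not the repo's classes).
from typing import Dict


class BaseTrainer:
    # Stand-in for transformers.Trainer; newer releases extend this signature,
    # e.g. log(self, logs, start_time=None).
    def log(self, logs: Dict[str, float], *args, **kwargs) -> None:
        print("logged:", logs)


class CustomTrainer(BaseTrainer):
    def log(self, logs: Dict[str, float], *args, **kwargs) -> None:
        logs = {k: v for k, v in logs.items() if not k.startswith("dummy_")}
        # Forward everything so the override survives upstream signature changes.
        return super().log(logs, *args, **kwargs)


CustomTrainer().log({"loss": 0.5, "dummy_metric": 1.0})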

src/llamafactory/train/kto/trainer.py

Lines changed: 5 additions & 12 deletions

@@ -28,7 +28,7 @@
 from typing_extensions import override
 
 from ...extras.constants import IGNORE_INDEX
-from ...extras.packages import is_transformers_version_equal_to_4_46, is_transformers_version_greater_than
+from ...extras.packages import is_transformers_version_greater_than
 from ..callbacks import SaveProcessorCallback
 from ..trainer_utils import create_custom_optimizer, create_custom_scheduler, get_batch_logps, nested_detach
 
@@ -256,19 +256,12 @@ def compute_loss(
         self, model: "PreTrainedModel", inputs: Dict[str, "torch.Tensor"], return_outputs: bool = False, **kwargs
     ) -> Union["torch.Tensor", Tuple["torch.Tensor", List["torch.Tensor"]]]:
         r"""
-        Fixes the loss value. See https://github.com/huggingface/transformers/pull/35438 for details.
+        Subclass and override to accept extra kwargs.
         """
-        loss = super().compute_loss(model, inputs, return_outputs)
-        if is_transformers_version_equal_to_4_46() and kwargs.get("num_items_in_batch"):
-            if return_outputs:
-                loss = (loss[0] / self.args.gradient_accumulation_steps, *loss[1:])
-            else:
-                loss = loss / self.args.gradient_accumulation_steps
-
-        return loss
+        return super().compute_loss(model, inputs, return_outputs)
 
     @override
-    def log(self, logs: Dict[str, float]) -> None:
+    def log(self, logs: Dict[str, float], *args, **kwargs) -> None:
         r"""
         Log `logs` on the various objects watching training, including stored metrics.
         """
@@ -304,4 +297,4 @@ def log(self, logs: Dict[str, float]) -> None:
         if not key.startswith("dummy_"):
             logs[key] = metric
 
-        return Trainer.log(self, logs)
+        return Trainer.log(self, logs, *args, **kwargs)

src/llamafactory/train/pt/trainer.py

Lines changed: 2 additions & 20 deletions

@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from types import MethodType
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Optional
 
 import torch
 from transformers import Trainer
@@ -25,7 +25,7 @@
 
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel, ProcessorMixin
+    from transformers import ProcessorMixin
 
     from ...hparams import FinetuningArguments
 
@@ -72,21 +72,3 @@ def _get_train_sampler(self) -> Optional["torch.utils.data.Sampler"]:
             return torch.utils.data.SequentialSampler(self.train_dataset)
 
         return super()._get_train_sampler()
-
-    @override
-    def compute_loss(
-        self, model: "PreTrainedModel", inputs: Dict[str, "torch.Tensor"], return_outputs: bool = False, **kwargs
-    ) -> Union["torch.Tensor", Tuple["torch.Tensor", List["torch.Tensor"]]]:
-        r"""
-        Fixes the loss value. See https://github.com/huggingface/transformers/pull/35438 for details.
-
-        It should be removed after https://github.com/huggingface/transformers/pull/35651 is merged.
-        """
-        loss = super().compute_loss(model, inputs, return_outputs, **kwargs)
-        if kwargs.get("num_items_in_batch") and not getattr(self, "model_accepts_loss_kwargs", False):
-            if return_outputs:
-                loss = (loss[0] / self.args.gradient_accumulation_steps, *loss[1:])
-            else:
-                loss = loss / self.args.gradient_accumulation_steps
-
-        return loss
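
Note: the `compute_loss` override deleted here (and its twin in the SFT trainer further down) existed only to undo a loss-scaling regression: as the removed docstring notes, affected transformers releases skipped the division by the gradient-accumulation step count for models that do not accept loss kwargs, which huggingface/transformers#35438 and #35651 fixed upstream. A toy illustration of why the division matters, with invented numbers:

# Toy arithmetic, invented numbers: each micro-batch loss must be divided by
# the number of gradient-accumulation steps.
grad_accum_steps = 4
micro_batch_losses = [2.0, 2.0, 2.0, 2.0]  # mean loss of each micro-batch

# Intended: the accumulated total matches the mean loss of one large batch.
scaled = sum(loss / grad_accum_steps for loss in micro_batch_losses)

# Regression: skipping the division inflates the effective loss (and the
# gradients) by a factor of grad_accum_steps.
unscaled = sum(micro_batch_losses)

print(scaled, unscaled)  # 2.0 vs. 8.0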

src/llamafactory/train/rm/trainer.py

Lines changed: 1 addition & 5 deletions

@@ -25,7 +25,7 @@
 from typing_extensions import override
 
 from ...extras import logging
-from ...extras.packages import is_transformers_version_equal_to_4_46, is_transformers_version_greater_than
+from ...extras.packages import is_transformers_version_greater_than
 from ..callbacks import FixValueHeadModelCallback, SaveProcessorCallback
 from ..trainer_utils import create_custom_optimizer, create_custom_scheduler
 
@@ -107,10 +107,6 @@ def compute_loss(
         chosen_scores, rejected_scores = chosen_scores.squeeze(), rejected_scores.squeeze()
 
         loss = -torch.nn.functional.logsigmoid(chosen_scores.float() - rejected_scores.float()).mean()
-
-        if is_transformers_version_equal_to_4_46() and kwargs.get("num_items_in_batch"):
-            loss /= self.args.gradient_accumulation_steps  # fixes the loss value for transformers 4.46.0-4.46.1
-
         if return_outputs:
             return loss, (loss, chosen_scores, rejected_scores)
         else:

src/llamafactory/train/sft/trainer.py

Lines changed: 1 addition & 19 deletions

@@ -34,7 +34,7 @@
 
 if TYPE_CHECKING:
     from torch.utils.data import Dataset
-    from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
+    from transformers import PreTrainedTokenizer, ProcessorMixin
     from transformers.trainer import PredictionOutput
 
     from ...hparams import FinetuningArguments
@@ -88,24 +88,6 @@ def _get_train_sampler(self) -> Optional["torch.utils.data.Sampler"]:
 
         return super()._get_train_sampler()
 
-    @override
-    def compute_loss(
-        self, model: "PreTrainedModel", inputs: Dict[str, "torch.Tensor"], return_outputs: bool = False, **kwargs
-    ) -> Union["torch.Tensor", Tuple["torch.Tensor", List["torch.Tensor"]]]:
-        r"""
-        Fixes the loss value. See https://github.com/huggingface/transformers/pull/35438 for details.
-
-        It should be removed after https://github.com/huggingface/transformers/pull/35651 is merged.
-        """
-        loss = super().compute_loss(model, inputs, return_outputs, **kwargs)
-        if kwargs.get("num_items_in_batch") and not getattr(self, "model_accepts_loss_kwargs", False):
-            if return_outputs:
-                loss = (loss[0] / self.args.gradient_accumulation_steps, *loss[1:])
-            else:
-                loss = loss / self.args.gradient_accumulation_steps
-
-        return loss
-
     @override
     def prediction_step(
         self,

src/llamafactory/webui/runner.py

Lines changed: 2 additions & 2 deletions

@@ -23,7 +23,7 @@
 
 from ..extras.constants import LLAMABOARD_CONFIG, PEFT_METHODS, TRAINING_STAGES
 from ..extras.misc import is_gpu_or_npu_available, torch_gc, use_ray
-from ..extras.packages import is_gradio_available, is_transformers_version_equal_to_4_46
+from ..extras.packages import is_gradio_available
 from .common import (
     DEFAULT_CACHE_DIR,
     DEFAULT_CONFIG_DIR,
@@ -180,7 +180,7 @@ def _parse_train_args(self, data: Dict["Component", Any]) -> Dict[str, Any]:
             plot_loss=True,
             trust_remote_code=True,
             ddp_timeout=180000000,
-            include_num_input_tokens_seen=False if is_transformers_version_equal_to_4_46() else True,  # FIXME
+            include_num_input_tokens_seen=True,
         )
         args.update(json.loads(get("train.extra_args")))

tests/model/model_utils/test_attention.py

Lines changed: 3 additions & 0 deletions

@@ -14,8 +14,10 @@
 
 import os
 
+import pytest
 from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available
 
+from llamafactory.extras.packages import is_transformers_version_greater_than
 from llamafactory.train.test_utils import load_infer_model
 
 
@@ -27,6 +29,7 @@
 }
 
 
+@pytest.mark.xfail(is_transformers_version_greater_than("4.48"), reason="Attention refactor.")
 def test_attention():
     attention_available = ["disabled"]
     if is_torch_sdpa_available():
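
Note: `pytest.mark.xfail` with a boolean first argument marks the test as an expected failure only when the condition holds at collection time; the test still runs and is reported as XFAIL (or XPASS) instead of breaking the suite on transformers 4.48+. A small sketch of the idiom with a made-up condition:

# Sketch of conditional xfail (made-up condition and test body).
import sys

import pytest


@pytest.mark.xfail(sys.version_info >= (3, 13), reason="Not validated on 3.13+.")
def test_example():
    assert 1 + 1 == 2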
