Commit e327ca3

Test: Fix Llama injection test and use a smaller model
The original Llama injection test loaded the full 7B Llama-2 checkpoint, which is not suitable for CI. This commit modifies the test to:
1. Build a small Llama model manually from a standard LlamaConfig instead of downloading the 7B checkpoint.
2. Keep the original failure mode: the reworked test still reproduces the bug and passes once the fix is applied.

Signed-off-by: huanyuqu <[email protected]>
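Editorial addendum (not part of the original message): a quick way to confirm that a single-layer Llama built from the stock configuration stays far smaller than the 7B checkpoint is to instantiate it and count parameters. The snippet below is an illustrative sketch; no parameter count is asserted anywhere in the test.

from transformers import LlamaConfig, LlamaForCausalLM

# One decoder layer, otherwise the default Llama dimensions (hidden_size=4096, etc.).
config = LlamaConfig(num_hidden_layers=1)
model = LlamaForCausalLM(config)

# Prints roughly a few hundred million parameters, versus ~7B for meta-llama/Llama-2-7b-hf.
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")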
1 parent ca4036f commit e327ca3

1 file changed


tests/unit/inference/test_inference.py

22 additions, 2 deletions
@@ -24,6 +24,7 @@
 from transformers import pipeline
 from transformers.models.t5.modeling_t5 import T5Block
 from transformers.models.roberta.modeling_roberta import RobertaLayer
+from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
 
 from deepspeed.accelerator import get_accelerator
 from deepspeed.git_version_info import torch_info
@@ -554,7 +555,7 @@ def test(self, model_w_task, injection_policy, query, inf_kwargs, assert_fn, dty
 
 
 @pytest.mark.seq_inference
-@pytest.mark.parametrize("model_w_task", [("meta-llama/Llama-2-7b-hf", "text-generation")], ids=["llama"])
+@pytest.mark.parametrize("model_w_task", [("hf-internal-testing/tiny-random-LlamaForCausalLM", "text-generation")], ids=["llama"])
 @pytest.mark.parametrize("dtype", [torch.half], ids=["fp16"])
 class TestLlamaInjection(DistributedTest):
     world_size = 1
@@ -571,12 +572,31 @@ def test(self, model_w_task, dtype, query, inf_kwargs, assert_fn):
             pytest.skip("This op had not been implemented on this system.", allow_module_level=True)
 
         model, task = model_w_task
+
+        tokenizer = LlamaTokenizer.from_pretrained(model)
+        config = LlamaConfig(
+            vocab_size=32000,
+            hidden_size=4096,
+            intermediate_size=11008,
+            num_hidden_layers=1,
+            num_attention_heads=32,
+            max_position_embeddings=2048,
+            initializer_range=0.02,
+            rms_norm_eps=1e-5,
+            use_cache=True,
+            pad_token_id=0,
+            bos_token_id=1,
+            eos_token_id=2,
+            tie_word_embeddings=True
+        )
+        model = LlamaForCausalLM(config)
 
         local_rank = int(os.getenv("LOCAL_RANK", "0"))
         device = torch.device(get_accelerator().device_name(local_rank))
 
         pipe = pipeline(task,
                         model=model,
+                        tokenizer=tokenizer,
                         device=torch.device("cpu"),
                         model_kwargs={"low_cpu_mem_usage": True},
                         framework="pt")
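Side note on the hunk above: when LlamaTokenizer.from_pretrained(model) runs, model is still the Hub id string from the parametrization, so only the tokenizer is downloaded; the weights are freshly initialized from the local LlamaConfig, and model is then rebound to the in-memory LlamaForCausalLM. The standalone sketch below restates that pattern outside the test harness (illustrative only; the default single-layer config here is an assumption, not the exact configuration used by the test).

from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, pipeline

model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"
tokenizer = LlamaTokenizer.from_pretrained(model_id)        # tokenizer comes from the Hub
model = LlamaForCausalLM(LlamaConfig(num_hidden_layers=1))  # weights are randomly initialized locally
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, framework="pt")
print(pipe("DeepSpeed is", max_new_tokens=4))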
@@ -597,7 +617,7 @@ def test(self, model_w_task, dtype, query, inf_kwargs, assert_fn):
             )
             check_injection(pipe.model)
         except AttributeError as e:
-            if "'LlamaAttention' object has no attribute 'num_heads'" in e:
+            if "'LlamaAttention' object has no attribute 'num_heads'" in str(e):
                 pytest.skip("Skipping due to transformers version compatibility issue with self-attention")
             raise e
 
