Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions QEfficient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# -----------------------------------------------------------------------------

from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
Expand All @@ -21,7 +21,7 @@
"export",
"compile",
"cloud_ai_100_exec_kv",
"QEffAutoModel",
"QEFFAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
"QEFFCommonLoader",
Expand Down
2 changes: 1 addition & 1 deletion QEfficient/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# -----------------------------------------------------------------------------

from QEfficient.base.common import QEFFCommonLoader # noqa: F401
from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM # noqa: F401
from QEfficient.transformers.models.modeling_auto import QEFFAutoModel, QEFFAutoModelForCausalLM # noqa: F401
2 changes: 1 addition & 1 deletion QEfficient/base/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def __init__(self, *args: Any, **kwds: Any) -> None:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel:
"""
Downloads HuggingFace model if already doesn't exist locally, returns QEffAutoModel object based on type of model.
Downloads the HuggingFace model if it doesn't already exist locally, and returns a QEFFAutoModel object based on the type of model.
"""
if not os.path.isdir(pretrained_model_name_or_path):
pretrained_model_name_or_path = login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs)
Expand Down
325 changes: 279 additions & 46 deletions QEfficient/transformers/models/modeling_auto.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion QEfficient/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def get_models_dir():
ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32
ONNX_EXPORT_EXAMPLE_FBS = 4
ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep
ONNX_EXPORT_OPSET = 13
ONNX_EXPORT_OPSET = 14

COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"]

Expand Down
7 changes: 6 additions & 1 deletion docs/source/hl_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
:member-order: bysource
:members:
```

## `QEFFAutoModel`
```{eval-rst}
.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel
:member-order: bysource
:members:
```
## `QEffAutoPeftModelForCausalLM`
```{eval-rst}
.. autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM
Expand Down
105 changes: 105 additions & 0 deletions tests/transformers/models/test_embedding_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------


import numpy as np
import onnxruntime as ort
import pytest
import torch
from transformers import AutoModel, AutoTokenizer

from QEfficient.transformers.models.modeling_auto import QEFFAutoModel
from QEfficient.utils import hf_download
from QEfficient.utils.constants import Constants

# Hugging Face repo ids used to parametrize the embedding-model test;
# the trailing comment on each entry names the model architecture.
embed_test_models = [
    "sentence-transformers/multi-qa-mpnet-base-cos-v1",  # MPNetForMaskedLM
    "BAAI/bge-reranker-v2-m3",  # XLMRobertaForSequenceClassification
    "BAAI/bge-small-en-v1.5",  # BertModel
]


def check_embed_pytorch_vs_ort_vs_ai100(
    model_name: str,
    seq_len: int = Constants.CTX_LEN,
    n_layer: int = 1,
):
    """
    Compare the embeddings of one model across four execution paths.

    The same padded prompt is run through the original HuggingFace PyTorch
    model, the ``QEFFAutoModel`` PyTorch model, the exported ONNX model (via
    ONNX Runtime) and the compiled model returned by ``generate`` after
    ``compile``. Each stage is checked against the previous reference using
    the mean absolute difference (MAD) of the outputs.

    Args:
        model_name: HuggingFace repo id; passed to ``hf_download`` and to
            ``AutoTokenizer.from_pretrained``.
        seq_len: Length the tokenized ``input_ids`` / ``attention_mask`` are
            padded to along the last dimension.
        n_layer: Forwarded as ``num_hidden_layers`` when loading both models.

    Raises:
        AssertionError: If any of the three MAD comparisons exceeds its
            tolerance (0, 1e-5 and 1e-3 respectively).
    """
    model_path = hf_download(
        repo_id=model_name,
        ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
    )

    # Prepare input: tokenize a fixed prompt and right-pad both tensors with
    # zeros up to seq_len.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer("My name is", return_tensors="pt")

    input_ids = torch.nn.functional.pad(inputs["input_ids"], (0, seq_len - inputs["input_ids"].size(1)), "constant", 0)
    attention_mask = torch.nn.functional.pad(
        inputs["attention_mask"], (0, seq_len - inputs["attention_mask"].size(1)), "constant", 0
    )
    inputs = dict(input_ids=input_ids, attention_mask=attention_mask)

    # Original PyTorch model
    pt_model = AutoModel.from_pretrained(
        model_path,
        num_hidden_layers=n_layer,
        attn_implementation="eager",
        trust_remote_code=True,
    )

    pt_outputs = pt_model(**inputs)
    # First output, first batch element, as a NumPy array; this is the
    # reference for the PyTorch and ONNX comparisons below.
    pt_embeddings = pt_outputs[0][0].detach().numpy()

    # Pytorch transformed model
    qeff_model = QEFFAutoModel.from_pretrained(
        pretrained_model_name_or_path=model_path,
        num_hidden_layers=n_layer,
        attn_implementation="eager",
        trust_remote_code=True,
    )
    qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
    qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy()

    # Compare original PyTorch and transformed PyTorch outputs: the two must
    # match exactly, hence the zero tolerance.
    mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings))
    print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad)
    assert mad <= 0, f"MAD is too high for PyTorch and PyTorch transformed qeff_model: {mad}"

    onnx_model = qeff_model.export()
    ort_session = ort.InferenceSession(str(onnx_model))

    # Prepare the inputs for ONNX Runtime (NumPy copies of the padded tensors;
    # the tensors in `inputs` are left untouched for the later AI 100 run).
    onnx_inputs = {"input_ids": np.array(input_ids), "attention_mask": np.array(attention_mask)}
    # Run inference
    onnx_outputs = ort_session.run(None, onnx_inputs)

    # Compare PyTorch and ONNX outputs
    onnx_embeddings = onnx_outputs[0]
    mad = np.mean(np.abs(pt_embeddings - onnx_embeddings))
    print("Mad for onnx and PyTorch is ", mad)
    assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}"

    qeff_model.compile(
        num_cores=14,
    )
    ai100_output = qeff_model.generate(inputs=inputs)

    # Compare ONNX and AI 100 outputs
    mad = np.mean(np.abs(ai100_output["output"] - onnx_embeddings))
    print("Mad for onnx and AI 100 output is ", mad)
    assert mad <= 10**-3, f"MAD is too high for onnx and AI 100: {mad}"


@pytest.mark.on_qaic
@pytest.mark.parametrize("model_name", embed_test_models)
def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name):
    """
    Run the PyTorch vs ONNX vs AI 100 output comparison for ``model_name``,
    using inputs padded to 32 tokens and a single hidden layer.
    """
    check_embed_pytorch_vs_ort_vs_ai100(
        model_name=model_name,
        n_layer=1,
        seq_len=32,
    )
Loading