Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 35 additions & 37 deletions QEfficient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,50 @@
import os
import warnings

from QEfficient.utils import custom_format_warning

# For faster downloads via hf_transfer
# This assignment is placed above the remaining import statements because it
# must be executed before hf_transfer is imported (which happens via those imports).
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Placeholder for all non-transformer models registered in QEfficient
import QEfficient.utils.model_registery # noqa: F401
from QEfficient.base import (
QEFFAutoModel,
QEFFAutoModelForCausalLM,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSpeechSeq2Seq,
QEFFCommonLoader,
)
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
from QEfficient.transformers.transform import transform
from QEfficient.utils import custom_format_warning
from QEfficient.utils.logging_utils import logger

# Use a custom warning format for a better logging experience
warnings.formatwarning = custom_format_warning

# Package version
__version__ = "0.0.1.dev0"

# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter

__all__ = [
"transform",
"export",
"compile",
"cloud_ai_100_exec_kv",
"QEFFAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
"QEFFAutoModelForImageTextToText",
"QEFFAutoModelForSpeechSeq2Seq",
"QEFFCommonLoader",
]


def check_qaic_sdk():
"""Check if QAIC SDK is installed"""
Expand All @@ -36,38 +67,5 @@ def check_qaic_sdk():
return False


# Conditionally import QAIC-related modules if the SDK is installed
__version__ = "0.0.1.dev0"

if check_qaic_sdk():
from QEfficient.base import (
QEFFAutoModel,
QEFFAutoModelForCausalLM,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSpeechSeq2Seq,
QEFFCommonLoader,
)
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
from QEfficient.transformers.transform import transform

# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter

__all__ = [
"transform",
"export",
"compile",
"cloud_ai_100_exec_kv",
"QEFFAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
"QEFFAutoModelForImageTextToText",
"QEFFAutoModelForSpeechSeq2Seq",
"QEFFCommonLoader",
]

else:
if not check_qaic_sdk():
logger.warning("QAIC SDK is not installed, eager mode features won't be available!")
82 changes: 53 additions & 29 deletions QEfficient/generation/cloud_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#
# -----------------------------------------------------------------------------

import platform
import sys
from pathlib import Path
from typing import Dict, List, Optional, Union
from warnings import warn
Expand All @@ -13,32 +15,29 @@

# Import the `qaicrt` runtime bindings. Try the regular import path first;
# if that fails, add the architecture-specific directory under
# /opt/qti-aic/dev/lib to sys.path and retry. The outcome is recorded in
# `is_qaicrt_imported` instead of raising, so this module can still be
# imported when the bindings are missing.
try:
    import qaicrt

    is_qaicrt_imported = True
except ImportError:
    try:
        sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
        import qaicrt

        is_qaicrt_imported = True
    except ImportError:
        is_qaicrt_imported = False

# Import the `QAicApi_pb2` bindings (aliased as `aicapi`). Try the regular
# import path first; if that fails, add /opt/qti-aic/dev/python to sys.path
# and retry. The outcome is recorded in `is_aicapi_imported` instead of
# raising, so this module can still be imported when the bindings are missing.
try:
    import QAicApi_pb2 as aicapi

    is_aicapi_imported = True
except ImportError:
    try:
        sys.path.append("/opt/qti-aic/dev/python")
        import QAicApi_pb2 as aicapi

        is_aicapi_imported = True
    except ImportError:
        # Record the failure on the aicapi flag (not the qaicrt one): otherwise
        # `is_aicapi_imported` would be left undefined and a successful qaicrt
        # import could be reported as failed.
        is_aicapi_imported = False

# Module-level lookup from aicapi data-type constants to numpy dtypes.
# It is only populated when `aicapi` was imported; otherwise it is left empty
# so that evaluating `aicapi.*` cannot fail at import time.
aic_to_np_dtype_mapping = (
    {
        aicapi.FLOAT_TYPE: np.dtype(np.float32),
        aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
        aicapi.INT8_Q_TYPE: np.dtype(np.int8),
        aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
        aicapi.INT16_Q_TYPE: np.dtype(np.int16),
        aicapi.INT32_Q_TYPE: np.dtype(np.int32),
        aicapi.INT32_I_TYPE: np.dtype(np.int32),
        aicapi.INT64_I_TYPE: np.dtype(np.int64),
        aicapi.INT8_TYPE: np.dtype(np.int8),
    }
    if is_aicapi_imported
    else {}
)


class QAICInferenceSession:
Expand All @@ -51,13 +50,30 @@ def __init__(
):
"""
Initialise a QAIC inference session.

:param qpc_path: str. Path to the saved QPC binary generated by compilation.
:param device_ids: List[int]. Device IDs to be used; if more than one is given, it enables a multi-card setup.
:param activate: bool. If False, activation will be skipped. Default=True.
:param enable_debug_logs: bool. If True, enable debug logs. Default=False.
"""
if not (is_qaicrt_imported and is_aicapi_imported):
raise ImportError(
"Unable to import `qaicrt` and/or `QAicApi_pb2` libraries required for executing QPC files on the CLOUD AI platform.\n"
"Please ensure that the QAIC platform SDK and apps SDK are installed correctly."
)

# Build the dtype mapping here, after the import check above, because it depends on aicapi constants
self.aic_to_np_dtype_mapping = {
aicapi.FLOAT_TYPE: np.dtype(np.float32),
aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
aicapi.INT8_Q_TYPE: np.dtype(np.int8),
aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
aicapi.INT16_Q_TYPE: np.dtype(np.int16),
aicapi.INT32_Q_TYPE: np.dtype(np.int32),
aicapi.INT32_I_TYPE: np.dtype(np.int32),
aicapi.INT64_I_TYPE: np.dtype(np.int64),
aicapi.INT8_TYPE: np.dtype(np.int8),
}
# Load QPC
if device_ids is not None:
devices = qaicrt.QIDList(device_ids)
Expand All @@ -66,36 +82,44 @@ def __init__(
else:
self.context = qaicrt.Context()
self.queue = qaicrt.Queue(self.context, 0) # Async API

if enable_debug_logs:
if self.context.setLogLevel(qaicrt.QLogLevel.QL_DEBUG) != qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to setLogLevel")

qpc = qaicrt.Qpc(str(qpc_path))

# Load IO Descriptor
iodesc = aicapi.IoDesc()
status, iodesc_data = qpc.getIoDescriptor()
if status != qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to getIoDescriptor")
iodesc.ParseFromString(bytes(iodesc_data))

self.allowed_shapes = [
[(aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes]
[(self.aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes]
for allowed_shape in iodesc.allowed_shapes
]
self.bindings = iodesc.selected_set.bindings
self.binding_index_map = {binding.name: binding.index for binding in self.bindings}

# Create and load Program
prog_properties = qaicrt.QAicProgramProperties()
prog_properties.SubmitRetryTimeoutMs = 60_000
if device_ids and len(device_ids) > 1:
prog_properties.devMapping = ":".join(map(str, device_ids))

self.program = qaicrt.Program(self.context, None, qpc, prog_properties)
if self.program.load() != qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to load program")

if activate:
self.activate()

# Create input qbuffers and buf_dims
self.qbuffers = [qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings]
self.buf_dims = qaicrt.BufferDimensionsVecRef(
[(aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings]
[(self.aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings]
)

@property
Expand Down Expand Up @@ -166,12 +190,12 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
# Run with async API
if self.queue.enqueue(self.execObj) != qaicrt.QStatus.QS_SUCCESS:
raise MemoryError("Failed to enqueue")

if self.execObj.waitForCompletion() != qaicrt.QStatus.QS_SUCCESS:
error_message = "Failed to run"
# Print additional error messages for unmatched dimension error
if self.allowed_shapes:
error_message += "\n\n"
error_message += '(Only if "No matching dimension found" error is present above)'
error_message += "\n\n(Only if 'No matching dimension found' error is present above)"
error_message += "\nAllowed shapes:"
for i, allowed_shape in enumerate(self.allowed_shapes):
error_message += f"\n{i}\n"
Expand Down Expand Up @@ -201,6 +225,6 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
continue
outputs[output_name] = np.frombuffer(
bytes(output_qbuffers[buffer_index]),
aic_to_np_dtype_mapping[self.bindings[buffer_index].type],
self.aic_to_np_dtype_mapping[self.bindings[buffer_index].type],
).reshape(self.buf_dims[buffer_index][1])
return outputs
Loading