diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index be4b86321..8e824b488 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -8,19 +8,50 @@
 import os
 import warnings
 
-from QEfficient.utils import custom_format_warning
-
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
-# hf_transfer is imported (will happen on line 15 via leading imports)
+# hf_transfer is imported (will happen on line 14 via leading imports)
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
 # Placeholder for all non-transformer models registered in QEfficient
 import QEfficient.utils.model_registery  # noqa: F401
+from QEfficient.base import (
+    QEFFAutoModel,
+    QEFFAutoModelForCausalLM,
+    QEFFAutoModelForImageTextToText,
+    QEFFAutoModelForSpeechSeq2Seq,
+    QEFFCommonLoader,
+)
+from QEfficient.compile.compile_helper import compile
+from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
+from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
+from QEfficient.peft import QEffAutoPeftModelForCausalLM
+from QEfficient.transformers.transform import transform
+from QEfficient.utils import custom_format_warning
 from QEfficient.utils.logging_utils import logger
 
 # custom warning for the better logging experience
 warnings.formatwarning = custom_format_warning
 
+# Package version
+__version__ = "0.0.1.dev0"
+
+# Users can use QEfficient.export for exporting models to ONNX
+export = qualcomm_efficient_converter
+
+__all__ = [
+    "transform",
+    "export",
+    "compile",
+    "cloud_ai_100_exec_kv",
+    "QEFFAutoModel",
+    "QEFFAutoModelForCausalLM",
+    "QEffAutoPeftModelForCausalLM",
+    "QEFFAutoModelForImageTextToText",
+    "QEFFAutoModelForSpeechSeq2Seq",
+    "QEFFCommonLoader",
+]
+
 
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
@@ -36,38 +67,5 @@ def check_qaic_sdk():
         return False
 
 
-# Conditionally import QAIC-related modules if the SDK is installed
-__version__ = "0.0.1.dev0"
-
-if check_qaic_sdk():
-    from QEfficient.base import (
-        QEFFAutoModel,
-        QEFFAutoModelForCausalLM,
-        QEFFAutoModelForImageTextToText,
-        QEFFAutoModelForSpeechSeq2Seq,
-        QEFFCommonLoader,
-    )
-    from QEfficient.compile.compile_helper import compile
-    from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
-    from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
-    from QEfficient.peft import QEffAutoPeftModelForCausalLM
-    from QEfficient.transformers.transform import transform
-
-    # Users can use QEfficient.export for exporting models to ONNX
-    export = qualcomm_efficient_converter
-
-    __all__ = [
-        "transform",
-        "export",
-        "compile",
-        "cloud_ai_100_exec_kv",
-        "QEFFAutoModel",
-        "QEFFAutoModelForCausalLM",
-        "QEffAutoPeftModelForCausalLM",
-        "QEFFAutoModelForImageTextToText",
-        "QEFFAutoModelForSpeechSeq2Seq",
-        "QEFFCommonLoader",
-    ]
-
-else:
+if not check_qaic_sdk():
     logger.warning("QAIC SDK is not installed, eager mode features won't be available!")
diff --git a/QEfficient/generation/cloud_infer.py b/QEfficient/generation/cloud_infer.py
index 8519d824c..4acc97787 100644
--- a/QEfficient/generation/cloud_infer.py
+++ b/QEfficient/generation/cloud_infer.py
@@ -5,6 +5,8 @@
 #
 # -----------------------------------------------------------------------------
 
+import platform
+import sys
 from pathlib import Path
 from typing import Dict, List, Optional, Union
 from warnings import warn
@@ -13,32 +15,29 @@
 
 try:
     import qaicrt
+
+    is_qaicrt_imported = True
 except ImportError:
-    import platform
-    import sys
+    try:
+        sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
+        import qaicrt
 
-    sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
-    import qaicrt
+        is_qaicrt_imported = True
+    except ImportError:
+        is_qaicrt_imported = False
 
 try:
     import QAicApi_pb2 as aicapi
-except ImportError:
-    import sys
 
-    sys.path.append("/opt/qti-aic/dev/python")
-    import QAicApi_pb2 as aicapi
+    is_aicapi_imported = True
+except ImportError:
+    try:
+        sys.path.append("/opt/qti-aic/dev/python")
+        import QAicApi_pb2 as aicapi
 
-aic_to_np_dtype_mapping = {
-    aicapi.FLOAT_TYPE: np.dtype(np.float32),
-    aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
-    aicapi.INT8_Q_TYPE: np.dtype(np.int8),
-    aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
-    aicapi.INT16_Q_TYPE: np.dtype(np.int16),
-    aicapi.INT32_Q_TYPE: np.dtype(np.int32),
-    aicapi.INT32_I_TYPE: np.dtype(np.int32),
-    aicapi.INT64_I_TYPE: np.dtype(np.int64),
-    aicapi.INT8_TYPE: np.dtype(np.int8),
-}
+        is_aicapi_imported = True
+    except ImportError:
+        is_aicapi_imported = False
 
 
 class QAICInferenceSession:
@@ -51,13 +50,30 @@ def __init__(
     ):
         """
         Initialise for QAIC inference Session
-        ---------
 
-        :qpc_path: str. Path to the save generated binary file after compilation.
-        :device_ids: List[int]. Device Ids to be used for compilation. if devices > 1, it enables multiple card setup.
-        :activate: bool. If false, activation will be disabled. Default=True.
-        :enable_debug_logs: bool. If True, It will enable debug logs. Default=False.
+        :param qpc_path: Path to the saved compiled QPC binary.
+        :param device_ids: Device IDs to be used; if > 1, enables multi-card setup.
+        :param activate: If False, activation will be skipped. Default=True.
+        :param enable_debug_logs: If True, enable debug logs. Default=False.
         """
+        if not (is_qaicrt_imported and is_aicapi_imported):
+            raise ImportError(
+                "Unable to import `qaicrt` and/or `QAicApi_pb2` libraries required for executing QPC files on the CLOUD AI platform.\n"
+                "Please ensure that the QAIC platform SDK and apps SDK are installed correctly."
+            )
+
+        # Build dtype mapping once (depends on aicapi constants)
+        self.aic_to_np_dtype_mapping = {
+            aicapi.FLOAT_TYPE: np.dtype(np.float32),
+            aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
+            aicapi.INT8_Q_TYPE: np.dtype(np.int8),
+            aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
+            aicapi.INT16_Q_TYPE: np.dtype(np.int16),
+            aicapi.INT32_Q_TYPE: np.dtype(np.int32),
+            aicapi.INT32_I_TYPE: np.dtype(np.int32),
+            aicapi.INT64_I_TYPE: np.dtype(np.int64),
+            aicapi.INT8_TYPE: np.dtype(np.int8),
+        }
         # Load QPC
         if device_ids is not None:
             devices = qaicrt.QIDList(device_ids)
@@ -66,36 +82,44 @@ def __init__(
         else:
             self.context = qaicrt.Context()
         self.queue = qaicrt.Queue(self.context, 0)  # Async API
+
         if enable_debug_logs:
             if self.context.setLogLevel(qaicrt.QLogLevel.QL_DEBUG) != qaicrt.QStatus.QS_SUCCESS:
                 raise RuntimeError("Failed to setLogLevel")
+
         qpc = qaicrt.Qpc(str(qpc_path))
+
         # Load IO Descriptor
         iodesc = aicapi.IoDesc()
         status, iodesc_data = qpc.getIoDescriptor()
         if status != qaicrt.QStatus.QS_SUCCESS:
             raise RuntimeError("Failed to getIoDescriptor")
         iodesc.ParseFromString(bytes(iodesc_data))
+
         self.allowed_shapes = [
-            [(aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes]
+            [(self.aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes]
             for allowed_shape in iodesc.allowed_shapes
         ]
         self.bindings = iodesc.selected_set.bindings
         self.binding_index_map = {binding.name: binding.index for binding in self.bindings}
+
         # Create and load Program
         prog_properties = qaicrt.QAicProgramProperties()
         prog_properties.SubmitRetryTimeoutMs = 60_000
         if device_ids and len(device_ids) > 1:
             prog_properties.devMapping = ":".join(map(str, device_ids))
+
         self.program = qaicrt.Program(self.context, None, qpc, prog_properties)
         if self.program.load() != qaicrt.QStatus.QS_SUCCESS:
             raise RuntimeError("Failed to load program")
+
         if activate:
             self.activate()
+
         # Create input qbuffers and buf_dims
         self.qbuffers = [qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings]
         self.buf_dims = qaicrt.BufferDimensionsVecRef(
-            [(aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings]
+            [(self.aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings]
         )
 
     @property
@@ -166,12 +190,12 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
         # Run with async API
         if self.queue.enqueue(self.execObj) != qaicrt.QStatus.QS_SUCCESS:
             raise MemoryError("Failed to enqueue")
+
         if self.execObj.waitForCompletion() != qaicrt.QStatus.QS_SUCCESS:
             error_message = "Failed to run"
             # Print additional error messages for unmatched dimension error
             if self.allowed_shapes:
-                error_message += "\n\n"
-                error_message += '(Only if "No matching dimension found" error is present above)'
+                error_message += "\n\n(Only if 'No matching dimension found' error is present above)"
                 error_message += "\nAllowed shapes:"
                 for i, allowed_shape in enumerate(self.allowed_shapes):
                     error_message += f"\n{i}\n"
@@ -201,6 +225,6 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
                 continue
             outputs[output_name] = np.frombuffer(
                 bytes(output_qbuffers[buffer_index]),
-                aic_to_np_dtype_mapping[self.bindings[buffer_index].type],
+                self.aic_to_np_dtype_mapping[self.bindings[buffer_index].type],
             ).reshape(self.buf_dims[buffer_index][1])
         return outputs