
Commit 24fad68

Revert to "addressed comments"
This reverts commit 81cea10.
1 parent cede121 commit 24fad68

File tree

7 files changed (+41, -37 lines)


QEfficient/generation/text_generation_inference.py

Lines changed: 8 additions & 8 deletions
@@ -57,10 +57,10 @@ class CloudAI100ExecInfo:
     perf_metrics: PerfMetrics

     def __repr__(self):
-        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\
-        \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} tokens/sec\
-        \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} tokens/sec\
-        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} tokens/sec"
+        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)}\
+        \nDecode token/sec is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)}\
+        \nTotal token/sec is= {round(self.perf_metrics.total_perf * self.batch_size, 2)}\
+        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}"


 @dataclass
@@ -70,10 +70,10 @@ class CloudAI100ExecInfoNew:
     perf_metrics: PerfMetrics

     def __repr__(self):
-        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\
-        \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} token/sec\
-        \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} token/sec\
-        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} sec"
+        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)}\
+        \nDecode token/sec is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)}\
+        \nTotal token/sec is= {round(self.perf_metrics.total_perf * self.batch_size, 2)}\
+        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}"


 io_files = []
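
Note: a minimal sketch (not part of this commit) of what the revised __repr__ layout renders for hypothetical metric values with batch_size = 1; the field names mirror the PerfMetrics attributes used in the "+" lines above.

# Hypothetical metric values, for illustration only.
prefill_time, decode_perf, total_perf, total_time = 0.12, 21.437, 19.852, 5.301
batch_size = 1

# Same f-string layout as the "+" lines above; the backslash continuations keep it
# one literal, so each metric lands on its own output line (the continuation
# indentation ends up as trailing spaces before each newline).
print(
    f"Average Prefill time a.k.a TTFT is= {round(prefill_time, 2)}\
    \nDecode token/sec is= {round(decode_perf * batch_size, 2)}\
    \nTotal token/sec is= {round(total_perf * batch_size, 2)}\
    \nTotal (E2E) inference time is= {round(total_time, 2)}"
)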

QEfficient/transformers/modeling_utils.py

Lines changed: 1 addition & 1 deletion
@@ -308,7 +308,7 @@ def _create_causal_mask(
     """
     A utility attention mask class that allows one to:
         - Create a causal 4d mask
-        - Create a causal 4d mask with sliding window
+        - Create a causal 4d mask with slided window
     """
     if sliding_window is not None:
         query_indices = position_ids.unsqueeze(-1)
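
Note: a minimal sketch (an assumption, not the library's _create_causal_mask implementation) of the idea the docstring describes: a key is visible only if it is causal and within sliding_window of the query, starting from query_indices = position_ids.unsqueeze(-1) as in the context line above.

import torch

def sliding_window_causal_mask(position_ids: torch.Tensor, ctx_len: int, sliding_window: int) -> torch.Tensor:
    # position_ids: (batch_size, seq_len); returns a boolean mask of shape (batch_size, seq_len, ctx_len).
    query_indices = position_ids.unsqueeze(-1)                  # (batch_size, seq_len, 1)
    key_indices = torch.arange(ctx_len).view(1, 1, ctx_len)     # (1, 1, ctx_len)
    causal = key_indices <= query_indices                       # no attention to future keys
    in_window = (query_indices - key_indices) < sliding_window  # only the last `sliding_window` keys
    return causal & in_window

mask = sliding_window_causal_mask(torch.arange(8).unsqueeze(0), ctx_len=8, sliding_window=4)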

QEfficient/transformers/models/internvl/modeling_internvl.py

Lines changed: 13 additions & 13 deletions
@@ -18,13 +18,13 @@ class QEffInternVLModel(nn.Module):
     def get_specializations(
         self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
     ):
-        # TODO: check if this should be named num_patches or something else
-        num_patches = compiler_options.get("num_patches", None)
-        if num_patches is None:
+        # TODO: check if this should be named num_crops or something else
+        num_crops = compiler_options.get("num_crops", None)
+        if num_crops is None:
             logger.warning(
-                "User should pass `num_patches` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13"
+                "User should pass `num_crops` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13"
             )
-            num_patches = 13
+            num_crops = 13

         prefill_seq_len = prefill_seq_len if prefill_seq_len else 3840  # 4096-256
         ctx_len = ctx_len if ctx_len else 4096
@@ -39,14 +39,14 @@ def get_specializations(
                 "batch_size": batch_size,
                 "seq_len": prefill_seq_len,
                 "ctx_len": ctx_len,
-                "num_patches": num_patches,
+                "num_crops": num_crops,
                 "img_size": img_size,
             },
             {
                 "batch_size": batch_size,
                 "seq_len": "1",
                 "ctx_len": ctx_len,
-                "num_patches": num_patches,
+                "num_crops": num_crops,
                 "img_size": img_size,
             },
         ]
@@ -58,7 +58,7 @@ def get_onnx_dynamic_axes(
         dynamic_axes = {}
         dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"}
         dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
-        dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"}
+        dynamic_axes["pixel_values"] = {0: "num_crops", 2: "img_size", 3: "img_size"}

         pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"}
         for i in range(self.language_model.config.num_hidden_layers):
@@ -79,12 +79,12 @@ def get_output_names(
     def get_dummy_inputs(self, kv_offload: bool = False):
         if kv_offload:
             raise ValueError("kv_offload method not supported for InternVL yet!")
-        num_patches = 13
+        NUM_CROPS = 13
         C = 3
         if vis_cfg := getattr(self.config, "vision_config", None):
-            img_size = getattr(vis_cfg, "image_size", 448)
+            img_size = getattr(vis_cfg, "image_size", 336)
         else:
-            img_size = 448
+            img_size = 336

         # Define shapes
         inputs_shapes = {}
@@ -93,7 +93,7 @@ def get_dummy_inputs(self, kv_offload: bool = False):
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
             constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
-        inputs_shapes["pixel_values"] = (num_patches, C, img_size, img_size)
+        inputs_shapes["pixel_values"] = (NUM_CROPS, C, img_size, img_size)

         # Define inputs
         inputs = {}
@@ -143,7 +143,7 @@ def get_inputs_info(self):
         return [
             IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")),
             IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")),
-            IOInfo(name="pixel_values", datatype=torch.float32, shape=("num_patches", 3, "img_size", "img_size")),
+            IOInfo(name="pixel_values", datatype=torch.float32, shape=("num_crops", 3, "img_size", "img_size")),
         ]
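
Note: to make the renamed axis concrete, a small sketch (an assumption, not code from this commit) of a dummy pixel_values tensor matching the shapes and dynamic axes declared above; axis 0 is the `num_crops` dimension that the warning asks users to fix via the compile API.

import torch

# Defaults taken from the "+" lines above: 13 crops, 3 channels, 336x336 images.
NUM_CROPS, C, IMG_SIZE = 13, 3, 336

pixel_values = torch.zeros((NUM_CROPS, C, IMG_SIZE, IMG_SIZE), dtype=torch.float32)

# The corresponding ONNX dynamic axes, mirroring get_onnx_dynamic_axes in the diff.
dynamic_axes = {"pixel_values": {0: "num_crops", 2: "img_size", 3: "img_size"}}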

QEfficient/transformers/models/llava/modeling_llava.py

Lines changed: 1 addition & 0 deletions
@@ -78,6 +78,7 @@ def get_dummy_inputs(self, **kwargs):
     def get_specializations(
         self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
     ):
+        # TODO: check if this should be named num_crops or something else
         max_num_images = compiler_options.get("max_num_images", 1)
         prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN
         ctx_len = ctx_len if ctx_len else CTX_LEN

QEfficient/transformers/models/mllama/modeling_mllama.py

Lines changed: 10 additions & 7 deletions
@@ -44,9 +44,11 @@
     _prepare_aspect_ratio_attention_mask,
     _prepare_cross_attention_mask,
 )
-from QEfficient.utils import constants
 from QEfficient.utils._utils import IOInfo

+CTX_LEN = 128
+SEQ_LEN = 32
+BS = 1
 MAX_NUM_IMG = 1
 NUM_CHANNEL = 3

@@ -386,6 +388,9 @@ def forward(
         if attention_mask is not None:  # no matter the length, we just slice it
             causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
             attn_weights = attn_weights + causal_mask
+            # attn_weights = torch.where(
+            #     attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights
+            # )

         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
         attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
@@ -1114,10 +1119,6 @@ def forward(
         return outputs

     def get_dummy_inputs(self, kv_offload: bool = False):
-        BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
-        SEQ_LEN = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-        CTX_LEN = constants.ONNX_EXPORT_CTX_LEN
-
         txt_cfg = self.config.get_text_config()
         num_hidden_layers = txt_cfg.num_hidden_layers
         cross_attention_layers = txt_cfg.cross_attention_layers
@@ -1191,9 +1192,11 @@ def get_specializations(
         **compiler_options,
     ):
         vis_cfg = self.config.vision_config
+
+        # TODO: check if this should be named num_crops or something else
         max_num_images = compiler_options.get("max_num_images", 1)
-        prefill_seq_len = prefill_seq_len if prefill_seq_len else 32
-        ctx_len = ctx_len if ctx_len else 128
+        prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN
+        ctx_len = ctx_len if ctx_len else CTX_LEN
         if img_size is None and hasattr(vis_cfg, "image_size"):
             img_size = getattr(vis_cfg, "image_size")
         elif img_size is None:
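
Note: a small sketch (an assumption, for illustration only) contrasting the additive-mask line kept in the attention hunk above with the torch.where form left commented out there; both push masked logits toward a large negative value before the softmax.

import torch

attn_weights = torch.randn(1, 1, 4, 4)  # (bs, heads, q_len, k_len)

# Additive form: the mask holds 0 on allowed positions and a large negative value elsewhere.
causal_mask = torch.triu(torch.full((4, 4), -10000.0), diagonal=1)
additive = attn_weights + causal_mask

# torch.where form: a boolean mask that is True on positions to suppress.
bool_mask = torch.triu(torch.ones(4, 4, dtype=torch.bool), diagonal=1)
where_form = torch.where(bool_mask, torch.tensor(-10000.0), attn_weights)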

QEfficient/transformers/models/modeling_auto.py

Lines changed: 6 additions & 6 deletions
@@ -33,8 +33,8 @@
     KVCacheModuleMethodMapperTransform,
     KVCacheTransform,
     SpDTransform,
-    VlmKVOffloadTransform,
-    VlmNoKVOffloadTransform,
+    VlmKVOffloadTransorm,
+    VlmNoKVOffloadTransorm,
 )
 from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers
 from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform
@@ -401,7 +401,7 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
         GPTQToMatmulNbitsTransform,
         CustomOpsTransform,
         KVCacheTransform,
-        VlmKVOffloadTransform,
+        VlmKVOffloadTransorm,
     ]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]

@@ -454,7 +454,7 @@ def model_name(self) -> str:
         return mname


-class _QEffAutoModelForImageTextToTextDuaSingleQPC:
+class _QEffAutoModelForImageTextToText2QPC:
     UNSUPPORTED_MODELS = ["LlavaForConditionalGeneration", "InternVLChatModel"]

     def __init__(
@@ -788,7 +788,7 @@ class _QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase):
         CustomOpsTransform,
         KVCacheTransform,
         KVCacheModuleMethodMapperTransform,
-        VlmNoKVOffloadTransform,
+        VlmNoKVOffloadTransorm,
     ]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]

@@ -1128,7 +1128,7 @@ def __new__(self, model: nn.Module, kv_offload=False, **kwargs):
             logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}")

         if kv_offload:
-            return _QEffAutoModelForImageTextToTextDuaSingleQPC(model, **kwargs)
+            return _QEffAutoModelForImageTextToText2QPC(model, **kwargs)
         else:
             return _QEFFAutoModelForImageTextToText1QPC(model, **kwargs)
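
Note: a conceptual sketch (an assumption, not the library's code) of the factory-style dispatch shown in the last hunk: __new__ returns one of two concrete wrappers depending on the kv_offload flag; the class names below are placeholders.

import torch.nn as nn

class _DualQPCWrapper:  # placeholder standing in for _QEffAutoModelForImageTextToText2QPC
    def __init__(self, model: nn.Module, **kwargs):
        self.model = model

class _SingleQPCWrapper:  # placeholder standing in for _QEFFAutoModelForImageTextToText1QPC
    def __init__(self, model: nn.Module, **kwargs):
        self.model = model

class ImageTextToTextFactory:
    def __new__(cls, model: nn.Module, kv_offload: bool = False, **kwargs):
        # Dispatch mirrors the diff: kv_offload selects the dual-QPC path.
        if kv_offload:
            return _DualQPCWrapper(model, **kwargs)
        return _SingleQPCWrapper(model, **kwargs)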

QEfficient/transformers/models/pytorch_transforms.py

Lines changed: 2 additions & 2 deletions
@@ -365,15 +365,15 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
         return model, transformed


-class VlmKVOffloadTransform(ModuleMappingTransform):
+class VlmKVOffloadTransorm(ModuleMappingTransform):
     # supported architectures
     _module_mapping = {
         # Llama
         MllamaTextCrossAttention: QEffMllamaTextCrossAttentionTwoQPC,
     }


-class VlmNoKVOffloadTransform(ModuleMappingTransform):
+class VlmNoKVOffloadTransorm(ModuleMappingTransform):
     # supported architectures
     _module_mapping = {
         # Llama
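
Note: a conceptual sketch (an assumption, not the library's ModuleMappingTransform) of what a module-mapping transform like the two classes above does: walk the model and swap each module whose class appears in _module_mapping for its QEff counterpart.

from typing import Dict, Tuple, Type
import torch.nn as nn

def apply_module_mapping(model: nn.Module, module_mapping: Dict[Type[nn.Module], Type[nn.Module]]) -> Tuple[nn.Module, bool]:
    transformed = False
    for module in model.modules():
        if type(module) in module_mapping:
            # Swap the class in place so the replacement's forward() is used from now on.
            module.__class__ = module_mapping[type(module)]
            transformed = True
    return model, transformed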
