
Commit 0997c2f

Fix doc for PerceptionLMForConditionalGeneration forward. (#40733)
* Fix doc for PerceptionLMForConditionalGeneration forward.
* fix last nit

Co-authored-by: raushan <[email protected]>
1 parent a72e5a4 commit 0997c2f

2 files changed: +85 -16 lines changed

src/transformers/models/perception_lm/modeling_perception_lm.py

Lines changed: 37 additions & 16 deletions
@@ -374,23 +374,44 @@ def forward(
         Example:
 
         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration
-
-        >>> model = PerceptionLMForConditionalGeneration.from_pretrained("perception_lm-hf/perception_lm-1.5-7b-hf")
-        >>> processor = AutoProcessor.from_pretrained("perception_lm-hf/perception_lm-1.5-7b-hf")
-
-        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
-        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
+        from transformers import AutoProcessor, AutoModelForImageTextToText
+        from huggingface_hub import hf_hub_download
+
+        MODEL_PATH = "facebook/Perception-LM-1B"
+        processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
+        model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH).to("cuda")
+        test_image_file = hf_hub_download(
+            repo_id="shumingh/perception_lm_test_images",
+            filename="14496_0.PNG",
+            repo_type="dataset",
+        )
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": test_image_file,
+                    },
+                    {"type": "text", "text": "Describe the bar plot in the image."},
+                ],
+            }
+        ]
+
+        inputs = processor.apply_chat_template(
+            [conversation],
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(model.device)
+        generate_ids = model.generate(**inputs, max_new_tokens=256)
+        input_length = inputs["input_ids"].shape[1]
+        generate_ids_without_inputs = generate_ids[:, input_length:]
 
-        >>> # Generate
-        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
-        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
+        for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True):
+            print(output)
         ```"""
         outputs = self.model(
             input_ids=input_ids,
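Note that the updated example decodes only the freshly generated tokens: `generate()` returns the prompt followed by the continuation, so the example slices the prompt off first. A minimal standalone sketch of that slicing, with made-up token values for illustration:

```python
import torch

# generate() output begins with an echo of the prompt tokens
input_ids = torch.tensor([[11, 22, 33]])              # hypothetical prompt, length 3
generate_ids = torch.tensor([[11, 22, 33, 4, 5, 6]])  # prompt + 3 new tokens

input_length = input_ids.shape[1]
new_tokens = generate_ids[:, input_length:]           # drop the echoed prompt
print(new_tokens)  # tensor([[4, 5, 6]])
```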

src/transformers/models/perception_lm/modular_perception_lm.py

Lines changed: 48 additions & 0 deletions
@@ -335,6 +335,54 @@ def forward(
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **lm_kwargs,
     ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        from transformers import AutoProcessor, AutoModelForImageTextToText
+        from huggingface_hub import hf_hub_download
+
+        MODEL_PATH = "facebook/Perception-LM-1B"
+        processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
+        model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH).to("cuda")
+        test_image_file = hf_hub_download(
+            repo_id="shumingh/perception_lm_test_images",
+            filename="14496_0.PNG",
+            repo_type="dataset",
+        )
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": test_image_file,
+                    },
+                    {"type": "text", "text": "Describe the bar plot in the image."},
+                ],
+            }
+        ]
+
+        inputs = processor.apply_chat_template(
+            [conversation],
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(model.device)
+        generate_ids = model.generate(**inputs, max_new_tokens=256)
+        input_length = inputs["input_ids"].shape[1]
+        generate_ids_without_inputs = generate_ids[:, input_length:]
+
+        for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True):
+            print(output)
+        ```"""
         outputs = self.model(
             input_ids=input_ids,
             pixel_values=pixel_values,
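The `labels` docstring added here follows the standard `-100` masking convention: positions labeled `-100` are excluded from the loss. A minimal sketch of that behavior in plain PyTorch (shapes and values are made up for illustration; the model also shifts labels internally for next-token prediction, which is omitted here):

```python
import torch
import torch.nn.functional as F

vocab_size = 10
logits = torch.randn(1, 4, vocab_size)       # (batch, seq_len, vocab)
labels = torch.tensor([[-100, -100, 3, 7]])  # first two positions masked out

# cross_entropy ignores positions whose label equals ignore_index,
# so only the two unmasked labels contribute to the loss
loss = F.cross_entropy(
    logits.view(-1, vocab_size),
    labels.view(-1),
    ignore_index=-100,
)
print(loss)
```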
