53 changes: 37 additions & 16 deletions src/transformers/models/perception_lm/modeling_perception_lm.py
@@ -374,23 +374,44 @@ def forward(
Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration

>>> model = PerceptionLMForConditionalGeneration.from_pretrained("perception_lm-hf/perception_lm-1.5-7b-hf")
>>> processor = AutoProcessor.from_pretrained("perception_lm-hf/perception_lm-1.5-7b-hf")

>>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
from transformers import AutoProcessor, AutoModelForImageTextToText
from huggingface_hub import hf_hub_download

MODEL_PATH = "facebook/Perception-LM-1B"
processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH).to("cuda")
# download a sample bar-plot image from a test dataset
test_image_file = hf_hub_download(
    repo_id="shumingh/perception_lm_test_images",
    filename="14496_0.PNG",
    repo_type="dataset",
)
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "url": test_image_file,
            },
            {"type": "text", "text": "Describe the bar plot in the image."},
        ],
    }
]

inputs = processor.apply_chat_template(
    [conversation],
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=256)
# keep only the newly generated tokens, dropping the echoed prompt
input_length = inputs["input_ids"].shape[1]
generate_ids_without_inputs = generate_ids[:, input_length:]

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True):
    print(output)
```"""
outputs = self.model(
input_ids=input_ids,
48 changes: 48 additions & 0 deletions src/transformers/models/perception_lm/modular_perception_lm.py
@@ -335,6 +335,54 @@ def forward(
logits_to_keep: Union[int, torch.Tensor] = 0,
**lm_kwargs,
) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
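For illustration, a minimal sketch of this masking convention with toy tensors (not part of this PR):

```python
import torch

# toy batch: 3 prompt tokens followed by 2 answer tokens
input_ids = torch.tensor([[15, 42, 7, 99, 3]])
labels = input_ids.clone()
labels[:, :3] = -100  # prompt positions are ignored by the loss
```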

Example:

```python
from transformers import AutoProcessor, AutoModelForImageTextToText
from huggingface_hub import hf_hub_download

MODEL_PATH = "facebook/Perception-LM-1B"
processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH).to("cuda")
# download a sample bar-plot image from a test dataset
test_image_file = hf_hub_download(
    repo_id="shumingh/perception_lm_test_images",
    filename="14496_0.PNG",
    repo_type="dataset",
)
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "url": test_image_file,
                # ── Inline review (Member): no need to download; we can put the image
                #    link here (https://huggingface.co/datasets/shumingh/perception_lm_test_images/resolve/main/14496_0.PNG)
                #    and it will also work.
                # ── Reply (Contributor Author): ok, let me test this out. I've already
                #    left Meta, let me find a GPU somewhere lol.
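                # Per the review suggestion above, a hedged, untested variant of the
                # image entry that skips hf_hub_download by passing the URL directly:
                #     {
                #         "type": "image",
                #         "url": "https://huggingface.co/datasets/shumingh/perception_lm_test_images/resolve/main/14496_0.PNG",
                #     },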
            },
            {"type": "text", "text": "Describe the bar plot in the image."},
        ],
    }
]

inputs = processor.apply_chat_template(
    [conversation],
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=256)
# keep only the newly generated tokens, dropping the echoed prompt
input_length = inputs["input_ids"].shape[1]
generate_ids_without_inputs = generate_ids[:, input_length:]

for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True):
    print(output)
```"""
outputs = self.model(
input_ids=input_ids,
pixel_values=pixel_values,