@@ -374,23 +374,44 @@ def forward(
374
374
Example:
375
375
376
376
```python
377
- >>> from PIL import Image
378
- >>> import requests
379
- >>> from transformers import AutoProcessor, PerceptionLMForConditionalGeneration
380
-
381
- >>> model = PerceptionLMForConditionalGeneration.from_pretrained("perception_lm-hf/perception_lm-1.5-7b-hf")
382
- >>> processor = AutoProcessor.from_pretrained("perception_lm-hf/perception_lm-1.5-7b-hf")
383
-
384
- >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
385
- >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
386
- >>> image = Image.open(requests.get(url, stream=True).raw)
387
-
388
- >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
377
+ from transformers import AutoProcessor, AutoModelForImageTextToText
378
+ from huggingface_hub import hf_hub_download
379
+
380
+ MODEL_PATH = "facebook/Perception-LM-1B"
381
+ processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
382
+ model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH).to("cuda")
383
+ test_image_file = hf_hub_download(
384
+ repo_id="shumingh/perception_lm_test_images",
385
+ filename="14496_0.PNG",
386
+ repo_type="dataset",
387
+ )
388
+ conversation = [
389
+ {
390
+ "role": "user",
391
+ "content": [
392
+ {
393
+ "type": "image",
394
+ "url": test_image_file,
395
+ },
396
+ {"type": "text", "text": "Describe the bar plot in the image."},
397
+ ],
398
+ }
399
+ ]
400
+
401
+ inputs = processor.apply_chat_template(
402
+ [conversation],
403
+ add_generation_prompt=True,
404
+ tokenize=True,
405
+ return_dict=True,
406
+ return_tensors="pt",
407
+ )
408
+ inputs = inputs.to(model.device)
409
+ generate_ids = model.generate(**inputs, max_new_tokens=256)
410
+ input_length = inputs["input_ids"].shape[1]
411
+ generate_ids_without_inputs = generate_ids[:, input_length:]
389
412
390
- >>> # Generate
391
- >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
392
- >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
393
- "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
413
+ for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True):
414
+ print(output)
394
415
```"""
395
416
outputs = self .model (
396
417
input_ids = input_ids ,
0 commit comments