docs/source/en/model_doc/got_ocr2.md: 39 changes (25 additions, 14 deletions)
@@ -44,13 +44,14 @@ The original code can be found [here](https://github.com/Ucas-HaoranWei/GOT-OCR2

```python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
->>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=True)

>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
->>> inputs = processor(image, return_tensors="pt").to(device)
+>>> inputs = processor(image, return_tensors="pt", device=device).to(device)

>>> generate_ids = model.generate(
... **inputs,
@@ -68,15 +69,16 @@ The original code can be found [here](https://github.com/Ucas-HaoranWei/GOT-OCR2

```python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
->>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=True)

>>> image1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
>>> image2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"

->>> inputs = processor([image1, image2], return_tensors="pt").to(device)
+>>> inputs = processor([image1, image2], return_tensors="pt", device=device).to(device)

>>> generate_ids = model.generate(
... **inputs,
@@ -96,13 +98,14 @@ GOT-OCR2 can also generate formatted text, such as markdown or LaTeX. Here is an

```python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
->>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=True)

>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/latex.png"
->>> inputs = processor(image, return_tensors="pt", format=True).to(device)
+>>> inputs = processor(image, return_tensors="pt", format=True, device=device).to(device)

>>> generate_ids = model.generate(
... **inputs,
@@ -124,14 +127,15 @@ Here is an example of how to process multiple pages at once:

```python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
->>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=True)

>>> image1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/page1.png"
>>> image2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/page2.png"
->>> inputs = processor([image1, image2], return_tensors="pt", multi_page=True, format=True).to(device)
+>>> inputs = processor([image1, image2], return_tensors="pt", multi_page=True, format=True, device=device).to(device)

>>> generate_ids = model.generate(
... **inputs,
@@ -153,13 +157,14 @@ Here is an example of how to process cropped patches:
```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", torch_dtype=torch.bfloat16, device_map=device)
->>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=True)

>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/one_column.png"
->>> inputs = processor(image, return_tensors="pt", format=True, crop_to_patches=True, max_patches=3).to(device)
+>>> inputs = processor(image, return_tensors="pt", format=True, crop_to_patches=True, max_patches=3, device=device).to(device)

>>> generate_ids = model.generate(
... **inputs,
@@ -179,13 +184,14 @@ GOT supports interactive OCR, where the user can specify the region to be recogn

```python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
->>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=True)

>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
->>> inputs = processor(image, return_tensors="pt", color="green").to(device) # or box=[x1, y1, x2, y2] for coordinates (image pixels)
+>>> inputs = processor(image, return_tensors="pt", color="green", device=device).to(device) # or box=[x1, y1, x2, y2] for coordinates (image pixels)

>>> generate_ids = model.generate(
... **inputs,
@@ -206,14 +212,15 @@ Here is an example of how to process sheet music:

```python
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
+>>> import torch
>>> import verovio

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
->>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=True)

>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/sheet_music.png"
->>> inputs = processor(image, return_tensors="pt", format=True).to(device)
+>>> inputs = processor(image, return_tensors="pt", format=True, device=device).to(device)

>>> generate_ids = model.generate(
... **inputs,
@@ -258,6 +265,10 @@ alt="drawing" width="600"/>

[[autodoc]] GotOcr2ImageProcessor

+## GotOcr2ImageProcessorFast
+
+[[autodoc]] GotOcr2ImageProcessorFast
+
## GotOcr2Processor

[[autodoc]] GotOcr2Processor
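Taken together, the documentation changes above introduce two processor kwargs: `use_fast=True` selects the new `GotOcr2ImageProcessorFast`, and `device=` runs image preprocessing on the accelerator before generation. Below is a minimal sketch of how one might sanity-check the two preprocessing paths against each other; it assumes `use_fast=False` keeps the existing slow processor, reuses the checkpoint and image URL from the docs above, and picks an illustrative tolerance (numerical parity between the two backends is an assumption, not something stated in this PR).

```python
import torch
from transformers import AutoProcessor

# Load the same checkpoint twice: once with the existing (slow) image processor,
# once with the fast one added in this PR.
processor_slow = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=False)
processor_fast = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=True)

image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"

inputs_slow = processor_slow(image, return_tensors="pt")
inputs_fast = processor_fast(image, return_tensors="pt")

# Shapes should match; small numerical differences between the two resizing
# backends are possible, hence the loose tolerance on the value comparison.
assert inputs_slow["pixel_values"].shape == inputs_fast["pixel_values"].shape
print(torch.allclose(inputs_slow["pixel_values"], inputs_fast["pixel_values"], atol=1e-1))
```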
src/transformers/__init__.py: 2 changes (2 additions, 0 deletions)
@@ -1323,6 +1323,7 @@
_import_structure["models.deit"].append("DeiTImageProcessorFast")
_import_structure["models.depth_pro"].append("DepthProImageProcessorFast")
_import_structure["models.detr"].append("DetrImageProcessorFast")
_import_structure["models.got_ocr2"].append("GotOcr2ImageProcessorFast")
_import_structure["models.llava"].append("LlavaImageProcessorFast")
_import_structure["models.llava_next"].append("LlavaNextImageProcessorFast")
_import_structure["models.llava_onevision"].append("LlavaOnevisionImageProcessorFast")
@@ -6502,6 +6503,7 @@
from .models.deit import DeiTImageProcessorFast
from .models.depth_pro import DepthProImageProcessorFast
from .models.detr import DetrImageProcessorFast
+from .models.got_ocr2 import GotOcr2ImageProcessorFast
from .models.llava import LlavaImageProcessorFast
from .models.llava_next import LlavaNextImageProcessorFast
from .models.llava_onevision import LlavaOnevisionImageProcessorFast
src/transformers/models/auto/image_processing_auto.py: 2 changes (1 addition, 1 deletion)
@@ -88,7 +88,7 @@
("fuyu", ("FuyuImageProcessor",)),
("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("glpn", ("GLPNImageProcessor",)),
("got_ocr2", ("GotOcr2ImageProcessor",)),
("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
("grounding-dino", ("GroundingDinoImageProcessor",)),
("groupvit", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("hiera", ("BitImageProcessor",)),
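The one-line change to the auto mapping above is what lets the auto class hand back the fast variant on request. A brief sketch of the expected resolution, assuming the standard `use_fast` toggle on `AutoImageProcessor` and the checkpoint used in the docs:

```python
from transformers import AutoImageProcessor, GotOcr2ImageProcessor, GotOcr2ImageProcessorFast

# With ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast") registered for
# "got_ocr2", use_fast picks which of the pair gets instantiated.
slow = AutoImageProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=False)
fast = AutoImageProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", use_fast=True)

assert isinstance(slow, GotOcr2ImageProcessor)
assert isinstance(fast, GotOcr2ImageProcessorFast)
```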
src/transformers/models/got_ocr2/__init__.py: 1 change (1 addition, 0 deletions)
@@ -20,6 +20,7 @@
if TYPE_CHECKING:
from .configuration_got_ocr2 import *
from .image_processing_got_ocr2 import *
+from .image_processing_got_ocr2_fast import *
from .modeling_got_ocr2 import *
from .processing_got_ocr2 import *

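With the class exported through `src/transformers/__init__.py` and the model subpackage above, the fast processor is also importable directly. A short sketch under two assumptions: the fast image processor accepts a `device=` argument like the processor calls in the docs diff, and a dummy image is an acceptable stand-in for a real document scan:

```python
import torch
from PIL import Image
from transformers import GotOcr2ImageProcessorFast

image_processor = GotOcr2ImageProcessorFast.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")

# A plain white image stands in for a real scan here.
image = Image.new("RGB", (1024, 1024), color="white")

# `device=` mirrors the kwarg added throughout the docs above; on CPU-only
# machines this is assumed to fall back to "cpu" without further changes.
device = "cuda" if torch.cuda.is_available() else "cpu"
pixel_values = image_processor(image, return_tensors="pt", device=device).pixel_values
print(pixel_values.shape, pixel_values.dtype)
```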