# mangaocr_demo / app.py
import gradio as gr
import re
from pathlib import Path
import jaconv
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoImageProcessor, AutoModelForVision2Seq
# Load the image processor, tokenizer, and vision encoder-decoder model.
pretrained_model_name_or_path = "jzhang533/manga-ocr-base-2025"
feature_extractor = AutoImageProcessor.from_pretrained(pretrained_model_name_or_path, use_fast=True)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
model = AutoModelForVision2Seq.from_pretrained(pretrained_model_name_or_path)
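# Optional sketch: move the model to a GPU when one is available. The demo
# runs fine on CPU, and inference() below sends its inputs to model.device,
# so this line is safe either way.
model.to("cuda" if torch.cuda.is_available() else "cpu")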
def post_process(text):
    # Strip all whitespace, normalize ellipses to runs of ASCII periods, and
    # convert half-width ASCII/digits to full-width, as is conventional in
    # Japanese text.
    text = "".join(text.split())
    text = text.replace("…", "...")
    text = re.sub("[・.]{2,}", lambda x: (x.end() - x.start()) * ".", text)
    text = jaconv.h2z(text, ascii=True, digit=True)
    return text
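# For example, post_process("こん にちは…") collapses the space and rewrites the
# ellipsis, and jaconv then widens the ASCII periods to full-width, yielding
# "こんにちは．．．" (exact output may vary with the jaconv version).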
def inference(img_or_path):
    # Accept either a filesystem path or an already-loaded PIL image.
    if isinstance(img_or_path, (str, Path)):
        img = Image.open(img_or_path)
    elif isinstance(img_or_path, Image.Image):
        img = img_or_path
    else:
        raise ValueError(f"img_or_path must be a path or PIL.Image, instead got: {img_or_path}")
    # Preprocess the image, decode autoregressively, then clean up the text.
    pixel_values = feature_extractor(img, return_tensors="pt").pixel_values
    x = pixel_values.squeeze()
    x = model.generate(x[None].to(model.device), max_length=300)[0].cpu()
    x = tokenizer.decode(x, skip_special_tokens=True)
    x = post_process(x)
    return x
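# A quick local sanity check might look like the commented line below
# (the Gradio interface further down is the actual entry point):
# print(inference("00.jpg"))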
title = 'MangaOCR demo'
description = '''
- Derived from: <https://github.com/kha-white/manga-ocr>
- Model used: <https://huggingface.co/jzhang533/manga-ocr-base-2025> (trained with the scripts in [kha-white/manga-ocr](https://github.com/kha-white/manga-ocr), with several tweaks)
- Training data: [Manga109-s](http://www.manga109.org/en/download_s.html) plus synthetic data.
'''
examples = [
    ['00.jpg'],
    ['01.jpg'],
    ['02.jpg'],
    ['03.jpg'],
    ['04.jpg'],
    ['05.jpg'],
    ['06.jpg'],
    ['07.jpg'],
]
gr.Interface(
    inference,
    inputs=[
        gr.Image(label="Upload Japanese Manga Image", type="filepath"),
    ],
    outputs="text",
    title=title,
    description=description,
    examples=examples,
).launch()
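# When running outside a hosted Space, passing share=True to launch() would
# expose a temporary public URL (an optional tweak, not part of this demo).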