# Source: Hugging Face Space "mr-don88/translate-subtitles" (status: Sleeping)
# File size: 10,312 bytes

import gradio as gr
from transformers import MarianMTModel, MarianTokenizer, pipeline
import pysrt
import tempfile
from tqdm import tqdm
from langdetect import detect
import os
from datetime import timedelta

# Language display names (Vietnamese, used as the UI dropdown choices) mapped
# to the language codes used to build "Helsinki-NLP/opus-mt-<src>-<tgt>" names
LANGUAGE_MODELS = {
    "Tiếng Anh": "en",
    "Tiếng Việt": "vi",
    "Tiếng Pháp": "fr",
    "Tiếng Đức": "de",
    "Tiếng Tây Ban Nha": "es",
    "Tiếng Bồ Đào Nha": "pt",
    "Tiếng Ý": "it",
    "Tiếng Nga": "ru",
    "Tiếng Hà Lan": "nl",
    "Tiếng Thụy Điển": "sv",
    "Tiếng Phần Lan": "fi",
    "Tiếng Đan Mạch": "da",
    "Tiếng Na Uy": "no",
    "Tiếng Ba Lan": "pl",
    "Tiếng Séc": "cs",
    "Tiếng Hungary": "hu",
    "Tiếng Romania": "ro",
    "Tiếng Hy Lạp": "el",
    "Tiếng Thổ Nhĩ Kỳ": "tr",
    "Tiếng Hindi": "hi",
    "Tiếng Ả Rập": "ar",
    "Tiếng Trung (Giản thể)": "zh",
    "Tiếng Nhật": "ja",
    "Tiếng Hàn": "ko"
}

# Inverted dictionary: language code -> display name
LANGUAGE_CODES = {v: k for k, v in LANGUAGE_MODELS.items()}

# Cache of loaded translation models, keyed by "<source>-<target>", so each
# model is only loaded once
model_cache = {}
# Text-classification pipeline created once at import time; used as the
# fallback language detector in detect_subtitle_language
detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")

def detect_subtitle_language(file_path):
    """Guess the language code of an .srt file from its first subtitles.

    The text of the first 10 subtitle entries (blank ones skipped) is joined
    into one sample and passed to ``langdetect``. If that raises, the
    ``detector`` pipeline is run on the first 512 characters instead.

    Args:
        file_path: Path of the .srt file to inspect.

    Returns:
        A language code such as "en" or "vi". Region suffixes are dropped
        ("zh-cn" becomes "zh") so the result can be looked up in
        LANGUAGE_CODES. Returns "en" when the sample is empty or when
        detection fails entirely.
    """
    try:
        subs = pysrt.open(file_path)
        sample_text = " ".join(sub.text for sub in subs[:10] if sub.text.strip())

        if not sample_text:
            return "en"  # Default to English when there is nothing to analyse

        try:
            # langdetect is tried first because it is the simpler detector
            lang_code = detect(sample_text)
        except Exception:
            # Fallback: xlm-roberta classifier, with the input length capped
            result = detector(sample_text[:512])[0]
            lang_code = result['label'].split('__')[-1]

        # langdetect reports Chinese as "zh-cn"/"zh-tw"; keep only the base
        # code so it matches the keys of LANGUAGE_CODES.
        return lang_code.split("-")[0]
    except Exception as e:
        print(f"Error detecting language: {e}")
        return "en"

def _load_marian_pair(model_name):
    """Load the (model, tokenizer) pair for one pretrained checkpoint name."""
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return model, tokenizer


def get_model(source_lang, target_lang):
    """Return the translation model(s) for a language pair, loading on first use.

    A direct "Helsinki-NLP/opus-mt-<source>-<target>" model is tried first.
    If it cannot be loaded and neither language is English, a two-step pivot
    through English (source -> en, en -> target) is loaded instead. Results
    are stored in ``model_cache`` under the key "<source>-<target>".

    Args:
        source_lang: Source language code, e.g. "fr".
        target_lang: Target language code, e.g. "vi".

    Returns:
        ``(model, tokenizer)`` for a direct model, or
        ``((model1, tokenizer1), (model2, tokenizer2))`` for an English pivot.

    Raises:
        gr.Error: If neither a direct model nor an English pivot can be loaded.
    """
    model_key = f"{source_lang}-{target_lang}"
    if model_key in model_cache:
        return model_cache[model_key]

    not_found_message = f"Không tìm thấy model dịch từ {source_lang} sang {target_lang}"

    try:
        entry = _load_marian_pair(f"Helsinki-NLP/opus-mt-{model_key}")
    except Exception as direct_error:
        # A pivot through English only makes sense when English is neither
        # end of the pair; otherwise it would retry the model that just
        # failed (plus a non-existent en-en model).
        if "en" in (source_lang, target_lang):
            raise gr.Error(not_found_message) from direct_error
        try:
            entry = (
                _load_marian_pair(f"Helsinki-NLP/opus-mt-{source_lang}-en"),
                _load_marian_pair(f"Helsinki-NLP/opus-mt-en-{target_lang}"),
            )
        except Exception as pivot_error:
            raise gr.Error(not_found_message) from pivot_error

    model_cache[model_key] = entry
    return entry

def translate_text(text, model, tokenizer):
    """Translate one piece of text with the given model/tokenizer pair.

    Args:
        text: The text to translate.
        model: Object whose ``generate(**inputs)`` produces output sequences.
        tokenizer: Callable that encodes ``text`` and offers ``batch_decode``.

    Returns:
        The first decoded sequence, with special tokens skipped.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    generated_ids = model.generate(**encoded)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

def add_time_to_subtitle(input_file, hours, minutes, seconds):
    """Shift every subtitle of an .srt file later by the given offset.

    Args:
        input_file: Path of the uploaded .srt file, or None if nothing was
            uploaded.
        hours: Whole hours to add; None (an emptied field) counts as 0.
        minutes: Whole minutes to add; None counts as 0.
        seconds: Seconds to add, may be fractional; None or an unparsable
            value counts as 0.

    Returns:
        A ``(output_path, status_message)`` tuple, where ``output_path`` is a
        new temporary .srt file saved as UTF-8.

    Raises:
        gr.Error: If no file was given, the file does not exist, or anything
            fails while reading, shifting or saving.
    """
    if input_file is None:
        raise gr.Error("Vui lòng upload file phụ đề!")

    try:
        if not os.path.exists(input_file):
            raise gr.Error("File không tồn tại hoặc không thể đọc!")

        subs = pysrt.open(input_file)

        # Convert the offset to milliseconds (fractional seconds supported).
        # float(None) raises TypeError, not ValueError, so both are handled.
        try:
            seconds_float = float(seconds)
        except (TypeError, ValueError):
            seconds_float = 0

        hours = int(hours or 0)
        minutes = int(minutes or 0)
        total_milliseconds = int((hours * 3600 + minutes * 60 + seconds_float) * 1000)

        # Apply the offset to every subtitle
        if total_milliseconds > 0:
            for sub in subs:
                sub.start.ordinal += total_milliseconds
                sub.end.ordinal += total_milliseconds

        # Reserve a temp file name and close the handle before pysrt writes
        # to that path; delete=False keeps the file on disk for the caller.
        with tempfile.NamedTemporaryFile(suffix=".srt", delete=False) as tmp:
            output_path = tmp.name
        subs.save(output_path, encoding='utf-8')
        return output_path, f"Đã thêm {hours}h {minutes}m {seconds_float}s vào file gốc"

    except gr.Error:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"Có lỗi xảy ra khi thêm thời gian: {str(e)}") from e

def translate_subtitle(input_file, source_language, target_language, hours, minutes, seconds):
    """Translate an .srt file and optionally shift all of its timestamps.

    Args:
        input_file: Path of the uploaded .srt file, or None if nothing was
            uploaded.
        source_language: Display name of the source language (a key of
            LANGUAGE_MODELS); unknown names are treated as English.
        target_language: Display name of the target language (a key of
            LANGUAGE_MODELS).
        hours: Whole hours to add; None (an emptied field) counts as 0.
        minutes: Whole minutes to add; None counts as 0.
        seconds: Seconds to add, may be fractional; None or an unparsable
            value counts as 0.

    Returns:
        A ``(output_path, status_message)`` tuple, where ``output_path`` is a
        new temporary .srt file saved as UTF-8.

    Raises:
        gr.Error: If no file was given, the file does not exist, the language
            selection is unusable, no model is available, or anything fails
            while translating or saving.
    """
    if input_file is None:
        raise gr.Error("Vui lòng upload file phụ đề!")

    try:
        if not os.path.exists(input_file):
            raise gr.Error("File không tồn tại hoặc không thể đọc!")

        source_code = LANGUAGE_MODELS.get(source_language, "en")
        target_code = LANGUAGE_MODELS.get(target_language)
        if target_code is None:
            raise gr.Error("Vui lòng chọn ngôn ngữ đích!")
        if source_code == target_code:
            # No direct same-language model exists; without this guard the
            # request either fails or round-trips the text through English.
            raise gr.Error("Ngôn ngữ nguồn và ngôn ngữ đích phải khác nhau!")

        model_info = get_model(source_code, target_code)

        subs = pysrt.open(input_file)

        # Convert the offset to milliseconds (fractional seconds supported).
        # float(None) raises TypeError, not ValueError, so both are handled.
        try:
            seconds_float = float(seconds)
        except (TypeError, ValueError):
            seconds_float = 0

        hours = int(hours or 0)
        minutes = int(minutes or 0)
        total_milliseconds = int((hours * 3600 + minutes * 60 + seconds_float) * 1000)

        # Apply the offset to every subtitle
        if total_milliseconds > 0:
            for sub in subs:
                sub.start.ordinal += total_milliseconds
                sub.end.ordinal += total_milliseconds

        # get_model returns either one (model, tokenizer) pair or two pairs
        # (pivot through English); normalise both shapes to a list of stages.
        if isinstance(model_info[0], tuple):
            stages = list(model_info)
        else:
            stages = [model_info]

        for sub in tqdm(subs, desc="Đang dịch"):
            if not sub.text.strip():
                continue
            text = sub.text
            for model, tokenizer in stages:
                text = translate_text(text, model, tokenizer)
            sub.text = text

        # Reserve a temp file name and close the handle before pysrt writes
        # to that path; delete=False keeps the file on disk for the caller.
        with tempfile.NamedTemporaryFile(suffix=".srt", delete=False) as tmp:
            output_path = tmp.name
        subs.save(output_path, encoding='utf-8')
        return output_path, f"Dịch từ {source_language} sang {target_language} thành công! Đã thêm {hours}h {minutes}m {seconds_float}s"

    except gr.Error:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"Có lỗi xảy ra: {str(e)}") from e

# Gradio interface
with gr.Blocks(title="Subtitle Translator Pro", theme="soft") as demo:
    gr.Markdown("# 🎬 Subtitle Translator Pro")
    gr.Markdown("Dịch phụ đề (.srt) giữa nhiều ngôn ngữ khác nhau")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload file phụ đề (.srt)", file_types=[".srt"])

            with gr.Row():
                source_lang = gr.Dropdown(
                    choices=list(LANGUAGE_MODELS.keys()),
                    value="Tiếng Anh",
                    label="Ngôn ngữ nguồn",
                    interactive=True
                )
                auto_detect = gr.Checkbox(label="Tự động phát hiện ngôn ngữ", value=True)

            target_lang = gr.Dropdown(
                choices=list(LANGUAGE_MODELS.keys()),
                value="Tiếng Việt",
                label="Ngôn ngữ đích"
            )

            with gr.Row():
                hours = gr.Number(label="Giờ", value=0, precision=0, minimum=0)
                minutes = gr.Number(label="Phút", value=0, precision=0, minimum=0, maximum=59)
                seconds = gr.Number(label="Giây", value=0, minimum=0, step=0.1)

            with gr.Row():
                add_time_btn = gr.Button("Chỉ thêm thời gian", variant="secondary")
                translate_btn = gr.Button("Dịch phụ đề", variant="primary")

        with gr.Column():
            file_output = gr.File(label="File phụ đề đã xử lý", interactive=False)
            status = gr.Textbox(label="Trạng thái")

    # Handle file upload
    def on_file_upload(file, auto_detect_flag):
        """Pre-select the source language dropdown from the uploaded file.

        When a file is present and auto-detection is ticked, the detected
        code is mapped to its display name ("Tiếng Anh" when the code is
        unknown or detection fails). Otherwise an argument-less Dropdown is
        returned.
        """
        if file and auto_detect_flag:
            try:
                lang_code = detect_subtitle_language(file.name)
                detected_lang = LANGUAGE_CODES.get(lang_code, "Tiếng Anh")
                return gr.Dropdown(value=detected_lang)
            except Exception:
                # Best effort: fall back to English, but let
                # KeyboardInterrupt/SystemExit propagate.
                return gr.Dropdown(value="Tiếng Anh")
        return gr.Dropdown()

    file_input.upload(
        fn=on_file_upload,
        inputs=[file_input, auto_detect],
        outputs=source_lang
    )

    # Handle the "add time only" button
    add_time_btn.click(
        fn=add_time_to_subtitle,
        inputs=[file_input, hours, minutes, seconds],
        outputs=[file_output, status]
    )

    # Handle the "translate subtitles" button
    translate_btn.click(
        fn=translate_subtitle,
        inputs=[file_input, source_lang, target_lang, hours, minutes, seconds],
        outputs=[file_output, status]
    )

    gr.Markdown("### Thông tin")
    gr.Markdown("""
    - Hỗ trợ định dạng .srt
    - Tự động phát hiện ngôn ngữ nguồn
    - Dịch giữa 24 ngôn ngữ khác nhau
    - Hỗ trợ dịch qua tiếng Anh nếu không có model trực tiếp
    - Thêm thời gian vào tất cả phụ đề (hỗ trợ giây thập phân)
    - Có nút riêng để chỉ thêm thời gian trước khi dịch
    """)

# Start the Gradio app only when this file is run as a script, not on import
if __name__ == "__main__":
    demo.launch()