donedd
Browse files
app.py
CHANGED
@@ -8,9 +8,9 @@ MODEL_NAME = "ai4bharat/indictrans2-indic-indic-1B"
|
|
8 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
9 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
10 |
|
11 |
-
# Supported languages: full name ->
|
12 |
LANGUAGES = {
|
13 |
-
|
14 |
"Bengali": "ben",
|
15 |
"Gujarati": "guj",
|
16 |
"Hindi": "hin",
|
@@ -34,11 +34,9 @@ def translate(text: str, src_lang_name: str, tgt_lang_name: str) -> str:
|
|
34 |
src_lang = LANGUAGES[src_lang_name]
|
35 |
tgt_lang = LANGUAGES[tgt_lang_name]
|
36 |
|
37 |
-
# Format input as required by IndicTrans2
|
38 |
formatted_text = f"{src_lang}>>{tgt_lang} {text}"
|
39 |
inputs = tokenizer(formatted_text, return_tensors="pt")
|
40 |
|
41 |
-
# Generate translations
|
42 |
output_tokens = model.generate(**inputs, max_length=512)
|
43 |
translation = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
|
44 |
|
|
|
8 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
9 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
10 |
|
11 |
+
# Supported languages: full name -> code
|
12 |
LANGUAGES = {
|
13 |
+
"Assamese": "asm",
|
14 |
"Bengali": "ben",
|
15 |
"Gujarati": "guj",
|
16 |
"Hindi": "hin",
|
|
|
34 |
src_lang = LANGUAGES[src_lang_name]
|
35 |
tgt_lang = LANGUAGES[tgt_lang_name]
|
36 |
|
|
|
37 |
formatted_text = f"{src_lang}>>{tgt_lang} {text}"
|
38 |
inputs = tokenizer(formatted_text, return_tensors="pt")
|
39 |
|
|
|
40 |
output_tokens = model.generate(**inputs, max_length=512)
|
41 |
translation = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
|
42 |
|