# backend.py

import os
import re  # Required for tokenization and splitting

import torch
from nemo.collections.asr.models import EncDecRNNTBPEModel

class KabyleASR:
    def __init__(self):
        self.device = "cpu"  # Free tier uses CPU
        self.model = None
        print("Loading NeMo ASR model for Kabyle (CPU mode)...")
        try:
            # Load the pre-trained Kabyle Conformer Transducer model
            self.model = EncDecRNNTBPEModel.from_pretrained(
                "nvidia/stt_kab_conformer_transducer_large"
            )
            self.model = self.model.to(self.device)
            # Optimize for CPU inference
            self.model.preprocessor.featurizer.dither = 0.0
            self.model.preprocessor.featurizer.pad_to = 0
            print("Model loaded successfully.")
        except Exception as e:
            raise RuntimeError(f"Failed to load NeMo model: {str(e)}")

    def post_process_kabyle_text(self, text):
        """
        Correct annexation in a Kabyle transcription by replacing spaces with dashes.

        Implements a structured set of rules based on the provided
        'Kabyle Transcription post script precessing rules v4.pdf' (and earlier
        versions), incorporating the user's nuanced treatment of 'StPa' particles.

        Args:
            text (str): The raw transcribed text from the ASR model.

        Returns:
            str: The post-processed text with correct annexation dashes.
        """
        # Particle sets based on the rules document v4, with corrections.
        CoPa = {'d', 'n', 'ur', 'i'}
        PoPro = {'inu', 'inem', 'ines', 'nneɣ', 'ntex', 'nwen', 'nwent', 'nsen', 'nsent',
                 'iw', 'ik', 'im', 'is', 'w', 'k', 'm', 's', 'tneɣ', 'tentex', 'tsen', 'tsent'}
        SpWo = {'deg', 'gar', 'ɣer', 'ɣur', 'fell', 'ɣef', 'ddaw', 'nnig', 'ɣid', 'aql',
                'sɣur', 'sennig', 'deffir', 'sdat'}
        # Split StPa into a main group and a special group based on user feedback
        StPaSp = {'i', 'am', 'at', 's', 'neɣ', 'aɣ'}
        StPa = {'ak', 'as', 'aneɣ', 'anteɣ', 'awen', 'awent', 'asen', 'asent',
                'k', 'm', 'ntex', 'wen', 'went', 'sen', 'sent', 'akem', 'att',
                'aken', 'akent', 'aten', 'atent'}
        DePa = {'a', 'agi', 'nni', 'ihin', 'nniden'}
        DiPa = {'id', 'in'}
        FuPa = {'ad', 'ara'}
        DiObPa = {'yi', 'k', 'kem', 't', 'tt', 'ay', 'ken', 'kent', 'ten', 'tent',
                  'iyi', 'ik', 'ikem', 'it', 'itt', 'iken', 'ikent', 'iten', 'itent'}
        InObPa = {'yi', 'yak', 'yam', 'yas', 'yaɣ', 'yawen', 'yawent', 'yasen', 'yasent'}

        # Combined set for general lookup, including both StPa groups
        all_particles = (CoPa | PoPro | SpWo | StPa | StPaSp | DePa | DiPa
                         | FuPa | DiObPa | InObPa)
        # Particles that may be annexed under Rule 9, correctly excluding StPaSp
        rule_9_particles = DiObPa | InObPa | DiPa | StPa
        # The full set of state particles for the other rules (Rules 5 and 11)
        full_stpa_set = StPa | StPaSp
        # Particles that can be part of the chain after FuPa (Rule 11)
        rule_11_particles = DiObPa | DiPa | full_stpa_set
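
        # Note on the StPa/StPaSp split (illustrative): 'i' lives only in
        # StPaSp, so Rule 9 leaves it free ("yefka i" stays "yefka i"),
        # while Rule 5 still annexes it after a SpWo ("ɣur i" -> "ɣur-i").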

        # Tokenize by splitting on whitespace and any existing dashes;
        # drop empty tokens so empty or dash-only input is handled safely
        tokens = [t for t in re.split(r'[\s-]+', text.lower().strip()) if t]
        processed_tokens = []
        i = 0
        while i < len(tokens):
            current_token = tokens[i]

            # --- Rule 11: FuPa followed by an annexation chain tied to a word ---
            if current_token in FuPa:
                annexation_chain = []
                j = i + 1
                while j < len(tokens) and tokens[j] in rule_11_particles:
                    annexation_chain.append(tokens[j])
                    j += 1
                if (annexation_chain and j < len(tokens)
                        and tokens[j] not in all_particles and len(tokens[j]) >= 2):
                    # Keep the future particle separate; dash the chain onto the word
                    processed_tokens.append(current_token)
                    processed_tokens.append("-".join(annexation_chain) + "-" + tokens[j])
                    i = j + 1
                    continue

            # --- Rule 5: Annex StPa to SpWo ---
            if current_token in SpWo and i + 1 < len(tokens) and tokens[i + 1] in full_stpa_set:
                processed_tokens.append(f"{current_token}-{tokens[i + 1]}")
                i += 2
                continue

            is_regular_word = current_token not in all_particles

            # --- Rule 7: Annex DePa to a word of 2+ letters ---
            if is_regular_word and len(current_token) >= 2 and i + 1 < len(tokens) and tokens[i + 1] in DePa:
                processed_tokens.append(f"{current_token}-{tokens[i + 1]}")
                i += 2
                continue

            # --- Rule 3: Annex PoPro to a word of 2+ letters ---
            if is_regular_word and len(current_token) >= 2 and i + 1 < len(tokens) and tokens[i + 1] in PoPro:
                processed_tokens.append(f"{current_token}-{tokens[i + 1]}")
                i += 2
                continue

            # --- Rule 9: Annex a combination of particles to a regular word ---
            if is_regular_word and len(current_token) >= 2 and i + 1 < len(tokens):
                annexation_chain = []
                j = i + 1
                while j < len(tokens) and tokens[j] in rule_9_particles:
                    annexation_chain.append(tokens[j])
                    j += 1
                if annexation_chain:
                    processed_tokens.append(f"{current_token}-" + "-".join(annexation_chain))
                    i = j
                    continue

            # --- All other tokens pass through unchanged (no annexation) ---
            processed_tokens.append(current_token)
            i += 1

        return " ".join(processed_tokens)

    def transcribe(self, audio_file):
        """
        Transcribe an audio file and apply Kabyle-specific post-processing.

        Args:
            audio_file (str): Path to the uploaded audio file.

        Returns:
            str: A clean, grammatically improved transcription.
        """
        if not os.path.exists(audio_file):
            return "Error: Audio file not found."
        try:
            # Transcribe using NeMo
            with torch.no_grad():
                result = self.model.transcribe([audio_file], batch_size=1, num_workers=0)
            # Some NeMo releases return a (best_hyps, all_hyps) tuple for RNNT models
            if isinstance(result, tuple):
                result = result[0]
            # Extract text from the Hypothesis object (or plain string)
            hypothesis = result[0]
            if hasattr(hypothesis, 'text'):
                raw_text = hypothesis.text.strip()
            else:
                raw_text = str(hypothesis).strip()
            if not raw_text:
                return "Transcription returned no text."
            # Apply Kabyle grammar post-processing
            return self.post_process_kabyle_text(raw_text)
        except Exception as e:
            return f"Transcription error: {str(e)}"