kab-asr-tanti / backend.py
# backend.py
import os
import torch
import re # Required for tokenization and splitting
from nemo.collections.asr.models import EncDecRNNTBPEModel
class KabyleASR:
    def __init__(self):
        self.device = "cpu"  # Free tier uses CPU
        self.model = None
        print("Loading NeMo ASR model for Kabyle (CPU mode)...")
        try:
            # Load the pre-trained Kabyle Conformer Transducer model
            self.model = EncDecRNNTBPEModel.from_pretrained("nvidia/stt_kab_conformer_transducer_large")
            self.model = self.model.to(self.device)
            # Optimize for CPU inference
            self.model.preprocessor.featurizer.dither = 0.0
            self.model.preprocessor.featurizer.pad_to = 0
            print("Model loaded successfully.")
        except Exception as e:
            raise RuntimeError(f"Failed to load NeMo model: {str(e)}")

    def post_process_kabyle_text(self, text):
        """
        Correct annexation in a Kabyle transcription by replacing spaces with dashes.

        The function implements a structured set of rules based on the
        'Kabyle Transcription post script processing rules v4.pdf' document and
        earlier versions, with the 'StPa' particles split into a main group and
        a special group.

        Args:
            text (str): The raw transcribed text from the ASR model.

        Returns:
            str: The post-processed text with correct annexation dashes.
        """
        # Particle sets based on the rules document v4, with corrections.
        CoPa = {'d', 'n', 'ur', 'i'}
        PoPro = {'inu', 'inem', 'ines', 'nneɣ', 'ntex', 'nwen', 'nwent', 'nsen', 'nsent',
                 'iw', 'ik', 'im', 'is', 'w', 'k', 'm', 's', 'tneɣ', 'tentex', 'tsen', 'tsent'}
        SpWo = {'deg', 'gar', 'ɣer', 'ɣur', 'fell', 'ɣef', 'ddaw', 'nnig', 'ɣid', 'aql', 'sɣur', 'sennig', 'deffir', 'sdat'}
        # StPa is split into a main group and a special group (StPaSp) that is excluded from Rule 9
        StPaSp = {'i', 'am', 'at', 's', 'neɣ', 'aɣ'}
        StPa = {'ak', 'as', 'aneɣ', 'anteɣ', 'awen', 'awent', 'asen', 'asent',
                'k', 'm', 'ntex', 'wen', 'went', 'sen', 'sent', 'akem', 'att',
                'aken', 'akent', 'aten', 'atent'}
        DePa = {'a', 'agi', 'nni', 'ihin', 'nniden'}
        DiPa = {'id', 'in'}
        FuPa = {'ad', 'ara'}
        DiObPa = {'yi', 'k', 'kem', 't', 'tt', 'ay', 'ken', 'kent', 'ten', 'tent',
                  'iyi', 'ik', 'ikem', 'it', 'itt', 'iken', 'ikent', 'iten', 'itent'}
        InObPa = {'yi', 'yak', 'yam', 'yas', 'yaɣ', 'yawen', 'yawent', 'yasen', 'yasent'}
        # Combined set for general lookup, including both StPa groups
        all_particles = CoPa.union(PoPro).union(SpWo).union(StPa).union(StPaSp).union(DePa).union(DiPa).union(FuPa).union(DiObPa).union(InObPa)
        # The set of particles that can be annexed according to Rule 9, excluding StPaSp
        rule_9_particles = DiObPa.union(InObPa).union(DiPa).union(StPa)
        # The full set of state particles for other rules (like Rule 5 and 11)
        full_stpa_set = StPa.union(StPaSp)
        # Particles that can be part of the chain after FuPa (Rule 11)
        rule_11_particles = DiObPa.union(DiPa).union(full_stpa_set)
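
        # Note (descriptive, derived from the checks below): the rules are applied
        # greedily, left to right, in the order Rule 11, 5, 7, 3, then 9; the first
        # rule that matches consumes its tokens and the scan resumes after them.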
        # First, tokenize the text by splitting on spaces and existing dashes
        tokens = re.split(r'[\s-]+', text.lower().strip())
        processed_tokens = []
        i = 0
        while i < len(tokens):
            current_token = tokens[i]

            # --- Rule 11: FuPa followed by an annexation chain tied to a word ---
            if current_token in FuPa:
                chain_start_idx = i + 1
                annexation_chain = []
                j = chain_start_idx
                while j < len(tokens) and tokens[j] in rule_11_particles:
                    annexation_chain.append(tokens[j])
                    j += 1
                if annexation_chain and j < len(tokens) and tokens[j] not in all_particles and len(tokens[j]) >= 2:
                    processed_tokens.append(current_token)
                    annexed_part = "-".join(annexation_chain) + "-" + tokens[j]
                    processed_tokens.append(annexed_part)
                    i = j + 1
                    continue

            # --- Rule 5: Annex StPa to SpWo ---
            if current_token in SpWo and i + 1 < len(tokens) and tokens[i + 1] in full_stpa_set:
                annexed_part = f"{current_token}-{tokens[i+1]}"
                processed_tokens.append(annexed_part)
                i += 2
                continue

            # --- Rule 7: Annex DePa to a word of 2+ letters ---
            is_regular_word = current_token not in all_particles
            if is_regular_word and len(current_token) >= 2 and i + 1 < len(tokens) and tokens[i+1] in DePa:
                processed_tokens.append(f"{current_token}-{tokens[i+1]}")
                i += 2
                continue

            # --- Rule 3: Annex PoPro to a word of 2+ letters ---
            is_regular_word = current_token not in all_particles
            if is_regular_word and len(current_token) >= 2 and i + 1 < len(tokens) and tokens[i+1] in PoPro:
                processed_tokens.append(f"{current_token}-{tokens[i+1]}")
                i += 2
                continue

            # --- Rule 9: Annex a combination of particles to a regular word ---
            is_regular_word_rule9 = current_token not in all_particles
            if is_regular_word_rule9 and len(current_token) >= 2 and i + 1 < len(tokens):
                annexation_chain = []
                j = i + 1
                while j < len(tokens) and tokens[j] in rule_9_particles:
                    annexation_chain.append(tokens[j])
                    j += 1
                if annexation_chain:
                    processed_tokens.append(f"{current_token}-" + "-".join(annexation_chain))
                    i = j
                    continue

            # --- Handle all other tokens as they are (no annexation) ---
            processed_tokens.append(current_token)
            i += 1

        final_text = " ".join(processed_tokens)
        return final_text
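
    # Illustrative sketch of what post_process_kabyle_text produces for a few made-up
    # inputs, derived from the rule checks above (hypothetical examples, not taken from
    # the rules PDF and not verified linguistically):
    #   "axxam nni"  -> "axxam-nni"   (Rule 7: DePa 'nni' annexed to a 2+ letter word)
    #   "ɣur sen"    -> "ɣur-sen"     (Rule 5: StPa 'sen' annexed to the SpWo 'ɣur')
    #   "ad as iniɣ" -> "ad as-iniɣ"  (Rule 11: the chain after FuPa 'ad' is tied to the next word)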

    def transcribe(self, audio_file):
        """
        Transcribe an audio file and apply Kabyle-specific post-processing.

        Args:
            audio_file (str): Path to the uploaded audio file.

        Returns:
            str: Clean, grammatically improved transcription.
        """
        if not os.path.exists(audio_file):
            return "Error: Audio file not found."
        try:
            # Transcribe using NeMo
            with torch.no_grad():
                result = self.model.transcribe([audio_file], batch_size=1, num_workers=0)
            # Extract the text: newer NeMo versions return Hypothesis objects,
            # older ones return plain strings
            hypothesis = result[0]
            if hasattr(hypothesis, 'text'):
                raw_text = hypothesis.text.strip()
            else:
                raw_text = str(hypothesis).strip()
            if not raw_text:
                return "Transcription returned no text."
            # Apply Kabyle grammar post-processing
            final_text = self.post_process_kabyle_text(raw_text)
            return final_text
        except Exception as e:
            return f"Transcription error: {str(e)}"