Spaces:

oyemade
/

yoruba-to-english-speech

Sleeping

App Files Files Community

yoruba-to-english-speech / app.py

oyemade

Update app.py

aebdd16 verified 19 days ago

raw

history blame contribute delete

2.17 kB

	import spaces
	import gradio as gr
	import numpy as np
	import torch

	from datasets import load_dataset
	from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline

	# Run everything on the first CUDA device when torch reports one, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"


	# Automatic-speech-recognition pipeline used to transcribe the input audio.
	asr_pipe = pipeline(
	    task="automatic-speech-recognition",
	    model="oyemade/w2v-bert-2.0-yoruba-colab-CV16.1",
	    device=device,
	)


	# Text-to-speech stack: SpeechT5 processor, SpeechT5 model and HiFi-GAN vocoder.
	# The model and the vocoder are moved to the selected device after loading.
	processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
	model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
	model = model.to(device)
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
	vocoder = vocoder.to(device)

	# Speaker x-vector: entry 7306 of the dataset's "validation" split, with a
	# leading dimension of size 1 inserted in front.
	embeddings_dataset = load_dataset(
	    "Matthijs/cmu-arctic-xvectors",
	    split="validation",
	    revision="refs/convert/parquet",
	)
	speaker_embeddings = torch.unsqueeze(
	    torch.tensor(embeddings_dataset[7306]["xvector"]),
	    0,
	)


	# Text translation pipeline configured from Yoruba (yor_Latn) to English (eng_Latn).
	translation_model = pipeline(
	    task="translation",
	    model="facebook/nllb-200-distilled-600M",
	    src_lang="yor_Latn",
	    tgt_lang="eng_Latn",
	    device=device,
	)


	def translate(audio):
	    """Transcribe ``audio`` and translate the transcript.

	    Args:
	        audio: Input handed unchanged to ``asr_pipe``.

	    Returns:
	        The ``translation_text`` field of the first result produced by
	        ``translation_model`` for the transcript.
	    """
	    transcript = asr_pipe(audio)["text"]
	    first_result = translation_model(transcript)[0]
	    return first_result["translation_text"]

	def synthesise(text):
	    """Generate speech for ``text`` with the SpeechT5 model and vocoder.

	    Args:
	        text: Text to be spoken.

	    Returns:
	        The generated speech tensor, moved to the CPU.
	    """
	    token_ids = processor(text=text, return_tensors="pt")["input_ids"]
	    waveform = model.generate_speech(
	        token_ids.to(device),
	        speaker_embeddings.to(device),
	        vocoder=vocoder,
	    )
	    return waveform.cpu()

	@spaces.GPU
	def speech_to_speech_translation(audio):
	    """Translate spoken input audio and return the result as synthesised speech.

	    Args:
	        audio: Value delivered by the input ``gr.Audio`` component. It is
	            ``None`` when the user submits before a recording or upload is
	            available.

	    Returns:
	        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
	        ``samples`` is an int16 NumPy array.

	    Raises:
	        gr.Error: If no audio was provided, so the UI shows a readable message
	            instead of an opaque pipeline failure.
	    """
	    if audio is None:
	        raise gr.Error(
	            "No audio received. Record or upload audio, wait for it to finish "
	            "loading, then press submit again."
	        )

	    translated_text = translate(audio)
	    waveform = synthesise(translated_text).numpy()

	    # Clip to [-1, 1] before scaling: a sample outside that range would
	    # otherwise wrap around when cast to int16 and produce loud artefacts.
	    int16_max = np.iinfo(np.int16).max
	    pcm_samples = (np.clip(waveform, -1.0, 1.0) * int16_max).astype(np.int16)
	    return 16000, pcm_samples

	# Gradio interface: audio from the microphone or an upload goes in as a file
	# path; the generated speech comes back as a NumPy audio component.
	iface = gr.Interface(
	    fn=speech_to_speech_translation,
	    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
	    outputs=gr.Audio(label="Generated Speech", type="numpy"),
	    title="Neoform AI: Yoruba Speech to English Speech",
	    description="Demo for Yoruba speech translated to English Speech. NOTE: If you get an ERROR after pressing submit, give the audio some secs to load then try again.",
	)

	# Start serving the interface.
	iface.launch()