import gradio as gr
import requests
import os
from typing import Optional

API_URL = "https://api-inference.huggingface.co/models/ibm-granite/granite-speech-3.3-2b"
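
# Remote path: when a Hugging Face token is supplied, the raw audio bytes are
# POSTed to the serverless Inference API. A cold model may still be loading on
# the first request, which is why the timeout handler below suggests retrying
# after a few minutes.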
def query_inference_api(audio_file_path: str, hf_token: Optional[str] = None) -> str:
    """
    Query the Hugging Face Inference API for speech transcription.
    """
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"

    try:
        with open(audio_file_path, "rb") as f:
            data = f.read()

        response = requests.post(API_URL, headers=headers, data=data, timeout=60)

        if response.status_code == 200:
            # The API may return either a dict with a "text" field or a list of
            # generation dicts; handle both shapes before falling back to str().
            result = response.json()
            if isinstance(result, dict) and 'text' in result:
                return result['text']
            elif isinstance(result, list) and len(result) > 0:
                return result[0].get('generated_text', str(result))
            else:
                return str(result)
        else:
            # Prefix with the error marker so process_audio() can fall back.
            return f"❌ API Error {response.status_code}: {response.text}"

    except requests.exceptions.Timeout:
        return "❌ Request timed out. The model might be loading. Please try again in a few minutes."
    except Exception as e:
        return f"❌ Error: {str(e)}"
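
# Local fallback: used when no token is given or the API call fails. The file
# is inspected with soundfile and its basic properties are reported instead of
# a transcription.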
def transcribe_with_local_processing(audio_file_path: str) -> str:
    """
    Fallback: simple local audio processing without heavy models.
    """
    try:
        import soundfile as sf

        data, samplerate = sf.read(audio_file_path)
        duration = len(data) / samplerate

        return f"""
📊 **Audio File Analysis:**
- Duration: {duration:.2f} seconds
- Sample Rate: {samplerate} Hz
- Channels: {'Mono' if len(data.shape) == 1 else 'Stereo'}

⚠️ **For actual transcription**:
This demo shows the file was processed successfully.
For full transcription, you would need:
1. A Hugging Face token (free to get)
2. Or run this on hardware with more resources

The Granite Speech 3.3-2B model supports:
- English, French, German, Spanish, Portuguese
- Speech-to-text transcription
- Speech translation to English
"""

    except Exception as e:
        return f"❌ Error processing audio: {str(e)}"
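
# Routing: prefer the Inference API when a token is available; fall back to the
# local analysis whenever the API path returns an error-prefixed message.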
def process_audio(audio_file, hf_token):
    """Main processing function."""
    if audio_file is None:
        return "❌ Please upload an audio file."

    # Try the Inference API first when a token was supplied.
    if hf_token and hf_token.strip():
        result = query_inference_api(audio_file, hf_token.strip())
        if not result.startswith("❌"):
            return f"🎤 **Transcription Result:**\n\n{result}"

    # No token, or the API call failed: fall back to local file analysis.
    return transcribe_with_local_processing(audio_file)
def create_interface():
    """Create the Gradio interface."""

    with gr.Blocks(
        title="Granite Speech Demo",
        theme=gr.themes.Soft(),
        css="footer {visibility: hidden}"
    ) as demo:

        gr.Markdown("""
        # 🎤 IBM Granite Speech 3.3-2B Demo

        **Two ways to use this demo:**
        1. **With HF Token** (recommended): Get a free token from [Hugging Face Settings](https://huggingface.co/settings/tokens)
        2. **Without Token**: Basic audio file analysis

        **Supported Languages**: English, French, German, Spanish, Portuguese
        """)
        with gr.Row():
            with gr.Column(scale=1):
                hf_token = gr.Textbox(
                    label="🔑 Hugging Face Token (Optional)",
                    placeholder="hf_xxx... (get from huggingface.co/settings/tokens)",
                    type="password",
                    info="Paste your free HF token for full transcription"
                )

                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    format="wav"
                )

                process_btn = gr.Button("🎯 Process Audio", variant="primary", size="lg")

                gr.Markdown("""
                ### 💡 Tips:
                - **Get HF Token**: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) → "New token" → "Read" access
                - **Audio format**: WAV, MP3, M4A supported
                - **Length**: Keep under 1 minute for best results
                - **Quality**: Clear speech works best
                """)
            with gr.Column(scale=2):
                output = gr.Textbox(
                    label="📝 Results",
                    lines=12,
                    interactive=False,
                    placeholder="Upload audio and click 'Process Audio' to see transcription..."
                )

        # Wire the button to the processing function.
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input, hf_token],
            outputs=output
        )
        gr.Markdown("""
        ---
        **About**: This demo uses IBM's Granite Speech 3.3-2B model for automatic speech recognition.
        The model supports multilingual transcription and translation capabilities.
        """)

    return demo
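
# Binding to 0.0.0.0 on Gradio's default port 7860 keeps the app reachable when
# it runs inside a container (for example on a Hugging Face Space).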
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)