# app.py — Gradio demo for IBM Granite Speech 3.3-2B (Hugging Face Space)
import gradio as gr
import requests
import os
from typing import Optional
# Hugging Face Inference API endpoint for the Granite Speech 3.3-2B ASR model.
API_URL = "https://api-inference.huggingface.co/models/ibm-granite/granite-speech-3.3-2b"
def query_inference_api(audio_file_path: str, hf_token: Optional[str] = None) -> str:
    """Send an audio file to the Hugging Face Inference API for transcription.

    Args:
        audio_file_path: Path of the audio file to upload.
        hf_token: Optional Hugging Face API token; sent as a Bearer
            credential when provided.

    Returns:
        The transcribed text on success, or a human-readable error message
        prefixed with "❌" on any failure.
    """
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"
    try:
        with open(audio_file_path, "rb") as f:
            data = f.read()
        response = requests.post(API_URL, headers=headers, data=data, timeout=60)
        if response.status_code == 200:
            result = response.json()
            # The API may return either a dict ({"text": ...}) or a list of
            # generation results depending on the task pipeline; fall back to
            # the raw representation for anything unexpected.
            if isinstance(result, dict) and 'text' in result:
                return result['text']
            elif isinstance(result, list) and len(result) > 0:
                return result[0].get('generated_text', str(result))
            else:
                return str(result)
        else:
            # Prefix with "❌" so callers (process_audio) can reliably detect
            # failure; previously API errors lacked the marker and were shown
            # to the user as successful transcription results.
            return f"❌ API Error {response.status_code}: {response.text}"
    except requests.exceptions.Timeout:
        return "❌ Request timed out. The model might be loading. Please try again in a few minutes."
    except Exception as e:
        # Boundary handler: this function's contract is to always return a
        # string, never raise, so the Gradio callback cannot crash.
        return f"❌ Error: {str(e)}"
def transcribe_with_local_processing(audio_file_path: str) -> str:
    """Model-free fallback: summarize basic properties of the audio file.

    Reads the file with soundfile and reports duration, sample rate and
    channel layout; returns an "❌"-prefixed error string on any failure
    (including soundfile being unavailable).
    """
    try:
        import soundfile as sf
        samples, rate = sf.read(audio_file_path)
        seconds = len(samples) / rate
        layout = 'Mono' if len(samples.shape) == 1 else 'Stereo'
        return f"""
πŸ“Š **Audio File Analysis:**
- Duration: {seconds:.2f} seconds
- Sample Rate: {rate} Hz
- Channels: {layout}
⚠️ **For actual transcription**:
This demo shows the file was processed successfully.
For full transcription, you would need:
1. A Hugging Face token (free to get)
2. Or run this on hardware with more resources
The Granite Speech 3.3-2B model supports:
- English, French, German, Spanish, Portuguese
- Speech-to-text transcription
- Speech translation to English
"""
    except Exception as e:
        return f"❌ Error processing audio: {str(e)}"
def process_audio(audio_file, hf_token):
    """Main processing function for the Gradio button.

    Args:
        audio_file: Filepath from the Gradio Audio component, or None when
            nothing was uploaded.
        hf_token: Optional user-supplied Hugging Face token string.

    Returns:
        A markdown-formatted transcription/analysis result or error string.
    """
    if audio_file is None:
        return "❌ Please upload an audio file."
    # Try the hosted Inference API first when a token was supplied.
    if hf_token and hf_token.strip():
        result = query_inference_api(audio_file, hf_token.strip())
        # Treat both "❌ ..." and legacy "API Error ..." responses as
        # failures; previously "API Error ..." slipped past this check and
        # was presented to the user as a successful transcription.
        if not result.startswith(("❌", "API Error")):
            return f"🎀 **Transcription Result:**\n\n{result}"
    # Fall back to local, model-free audio analysis.
    return transcribe_with_local_processing(audio_file)
def create_interface():
    """Create the Gradio interface.

    Lays out a two-column Blocks UI: the left column takes an optional
    Hugging Face token and an audio upload plus a process button, the
    right column shows the results textbox. The button is wired to
    process_audio.
    """
    with gr.Blocks(
        title="Granite Speech Demo",
        theme=gr.themes.Soft(),
        # Hide Gradio's default footer.
        css="footer {visibility: hidden}"
    ) as demo:
        gr.Markdown("""
        # 🎀 IBM Granite Speech 3.3-2B Demo
        **Two ways to use this demo:**
        1. **With HF Token** (recommended): Get free token from [Hugging Face Settings](https://huggingface.co/settings/tokens)
        2. **Without Token**: Basic audio file analysis
        **Supported Languages**: English, French, German, Spanish, Portuguese
        """)
        with gr.Row():
            with gr.Column(scale=1):
                # Token input
                hf_token = gr.Textbox(
                    label="πŸ”‘ Hugging Face Token (Optional)",
                    placeholder="hf_xxx... (get from huggingface.co/settings/tokens)",
                    type="password",  # mask the token on screen
                    info="Paste your free HF token for full transcription"
                )
                # Audio input
                audio_input = gr.Audio(
                    label="πŸ“ Upload Audio File",
                    type="filepath",  # process_audio expects a path, not raw samples
                    format="wav"
                )
                # Process button
                process_btn = gr.Button("🎯 Process Audio", variant="primary", size="lg")
                # Example info
                gr.Markdown("""
                ### πŸ’‘ Tips:
                - **Get HF Token**: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) β†’ "New token" β†’ "Read" access
                - **Audio format**: WAV, MP3, M4A supported
                - **Length**: Keep under 1 minute for best results
                - **Quality**: Clear speech works best
                """)
            with gr.Column(scale=2):
                # Output
                output = gr.Textbox(
                    label="πŸ“ Results",
                    lines=12,
                    interactive=False,  # read-only result display
                    placeholder="Upload audio and click 'Process Audio' to see transcription..."
                )
        # Event handler
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input, hf_token],
            outputs=output
        )
        # Footer info
        gr.Markdown("""
        ---
        **About**: This demo uses IBM's Granite Speech 3.3-2B model for automatic speech recognition.
        Model supports multilingual transcription and translation capabilities.
        """)
    return demo
# Launch the app
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 — the Hugging Face Spaces default.
    app = create_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)