import gradio as gr
import requests
import os
from typing import Optional

API_URL = "https://api-inference.huggingface.co/models/ibm-granite/granite-speech-3.3-2b"
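
# Remote path: when a Hugging Face token is supplied, the raw audio bytes are
# POSTed to the serverless Inference API. A cold model may still be loading on
# the first request, which is why the timeout handler below suggests retrying
# after a few minutes.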
def query_inference_api(audio_file_path: str, hf_token: Optional[str] = None) -> str:
    """
    Query the Hugging Face Inference API for speech transcription.
    """
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"

    try:
        with open(audio_file_path, "rb") as f:
            data = f.read()

        response = requests.post(API_URL, headers=headers, data=data, timeout=60)

        if response.status_code == 200:
            # The API may return either a dict with a "text" field or a list of
            # generation dicts; handle both shapes before falling back to str().
            result = response.json()
            if isinstance(result, dict) and 'text' in result:
                return result['text']
            elif isinstance(result, list) and len(result) > 0:
                return result[0].get('generated_text', str(result))
            else:
                return str(result)
        else:
            # Prefix with the error marker so process_audio() can fall back.
            return f"❌ API Error {response.status_code}: {response.text}"

    except requests.exceptions.Timeout:
        return "❌ Request timed out. The model might be loading. Please try again in a few minutes."
    except Exception as e:
        return f"❌ Error: {str(e)}"
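
# Local fallback: used when no token is given or the API call fails. The file
# is inspected with soundfile and its basic properties are reported instead of
# a transcription.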
def transcribe_with_local_processing(audio_file_path: str) -> str:
    """
    Fallback: simple local audio processing without heavy models.
    """
    try:
        import soundfile as sf

        data, samplerate = sf.read(audio_file_path)
        duration = len(data) / samplerate

        return f"""
📊 **Audio File Analysis:**
- Duration: {duration:.2f} seconds
- Sample Rate: {samplerate} Hz
- Channels: {'Mono' if len(data.shape) == 1 else 'Stereo'}

⚠️ **For actual transcription**:
This demo shows the file was processed successfully.
For full transcription, you would need:
1. A Hugging Face token (free to get)
2. Or run this on hardware with more resources

The Granite Speech 3.3-2B model supports:
- English, French, German, Spanish, Portuguese
- Speech-to-text transcription
- Speech translation to English
"""

    except Exception as e:
        return f"❌ Error processing audio: {str(e)}"
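
# Routing: prefer the Inference API when a token is available; fall back to the
# local analysis whenever the API path returns an error-prefixed message.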
def process_audio(audio_file, hf_token):
    """Main processing function."""
    if audio_file is None:
        return "❌ Please upload an audio file."

    # Try the Inference API first when a token was supplied.
    if hf_token and hf_token.strip():
        result = query_inference_api(audio_file, hf_token.strip())
        if not result.startswith("❌"):
            return f"🎤 **Transcription Result:**\n\n{result}"

    # No token, or the API call failed: fall back to local file analysis.
    return transcribe_with_local_processing(audio_file)
def create_interface():
    """Create the Gradio interface."""

    with gr.Blocks(
        title="Granite Speech Demo",
        theme=gr.themes.Soft(),
        css="footer {visibility: hidden}"
    ) as demo:

        gr.Markdown("""
        # 🎤 IBM Granite Speech 3.3-2B Demo

        **Two ways to use this demo:**
        1. **With HF Token** (recommended): Get a free token from [Hugging Face Settings](https://huggingface.co/settings/tokens)
        2. **Without Token**: Basic audio file analysis

        **Supported Languages**: English, French, German, Spanish, Portuguese
        """)
        with gr.Row():
            with gr.Column(scale=1):
                hf_token = gr.Textbox(
                    label="🔑 Hugging Face Token (Optional)",
                    placeholder="hf_xxx... (get from huggingface.co/settings/tokens)",
                    type="password",
                    info="Paste your free HF token for full transcription"
                )

                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    format="wav"
                )

                process_btn = gr.Button("🎯 Process Audio", variant="primary", size="lg")

                gr.Markdown("""
                ### 💡 Tips:
                - **Get HF Token**: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) → "New token" → "Read" access
                - **Audio format**: WAV, MP3, M4A supported
                - **Length**: Keep under 1 minute for best results
                - **Quality**: Clear speech works best
                """)
            with gr.Column(scale=2):
                output = gr.Textbox(
                    label="📝 Results",
                    lines=12,
                    interactive=False,
                    placeholder="Upload audio and click 'Process Audio' to see transcription..."
                )

        # Wire the button to the processing function.
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input, hf_token],
            outputs=output
        )
        gr.Markdown("""
        ---
        **About**: This demo uses IBM's Granite Speech 3.3-2B model for automatic speech recognition.
        The model supports multilingual transcription and translation capabilities.
        """)

    return demo
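
# Binding to 0.0.0.0 on Gradio's default port 7860 keeps the app reachable when
# it runs inside a container (for example on a Hugging Face Space).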
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)