Spaces:
Running
Running
import gradio as gr | |
import torch | |
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer | |
import numpy as np | |
from scipy.io.wavfile import write | |
import os | |
print("Initializing models...")

# --- Lyrics generation -------------------------------------------------------
# GPT-2 is only a stand-in; a checkpoint fine-tuned on lyrics can be dropped in
# by changing this name.
lyrics_model_name = "gpt2"
lyrics_tokenizer = AutoTokenizer.from_pretrained(lyrics_model_name)
lyrics_model = AutoModelForCausalLM.from_pretrained(lyrics_model_name)
lyrics_generator = pipeline(
    "text-generation",
    model=lyrics_model,
    tokenizer=lyrics_tokenizer,
)

# --- Bark (used for both the vocals and the backing track) -------------------
from transformers import BarkModel, BarkProcessor

print("Loading Bark model...")
bark_processor = BarkProcessor.from_pretrained("suno/bark")
bark_model = BarkModel.from_pretrained("suno/bark")

# Run Bark on the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
bark_model = bark_model.to(device)
def generate_lyrics(prompt, max_length=150):
    """Generate song lyrics for *prompt* with the module-level text generator.

    Args:
        prompt: Free-text description of the song's topic.
        max_length: Forwarded to the generator as ``max_length``.
            NOTE(review): in transformers this limit presumably counts the
            instruction prompt's tokens as well as the new ones - confirm.

    Returns:
        The generated text with the instruction prompt removed and surrounding
        whitespace stripped.
    """
    # Wrap the topic in an instruction so the model is steered towards lyrics.
    enhanced_prompt = f"Write song lyrics about {prompt}. Include a verse and chorus structure:"

    generated = lyrics_generator(
        enhanced_prompt,
        max_length=max_length,
        num_return_sequences=1,
        # temperature / top_k / top_p are sampling controls. Enable sampling
        # explicitly instead of relying on the loaded checkpoint's own
        # configuration, so a swapped-in model cannot silently decode greedily.
        do_sample=True,
        temperature=0.9,
        top_k=50,
        top_p=0.95,
    )

    # The generated text echoes the instruction; drop it so only lyrics remain.
    full_text = generated[0]["generated_text"]
    return full_text.replace(enhanced_prompt, "").strip()
def generate_vocals(lyrics, voice_preset="v2/en_speaker_6"):
    """Synthesize vocals for *lyrics* with Bark and save them as a WAV file.

    Args:
        lyrics: Text to be voiced.
        voice_preset: Bark speaker preset identifier handed to the processor.

    Returns:
        A ``(output_path, sample_rate, audio_array)`` tuple: the path of the
        written WAV file, its sample rate in Hz, and the generated samples as
        a NumPy array.
    """
    print(f"Generating vocals with lyrics: {lyrics[:50]}...")

    # Surround the text with music notes to nudge Bark towards singing.
    vocals_text = f"♪ {lyrics} ♪"
    inputs = bark_processor(text=vocals_text, voice_preset=voice_preset)

    # The model may have been moved to an accelerator, but the processor output
    # is not moved along with it. Put every entry that supports `.to` on the
    # model's device so `generate` does not mix devices; entries without a
    # `.to` method are passed through unchanged.
    model_device = bark_model.device
    inputs = {
        key: value.to(model_device) if hasattr(value, "to") else value
        for key, value in inputs.items()
    }

    audio_array = bark_model.generate(**inputs)
    audio_array = audio_array.cpu().numpy().squeeze()

    # Ask the model for its output rate; fall back to 24000 Hz (the value this
    # function has always reported) if the generation config does not carry one.
    generation_config = getattr(bark_model, "generation_config", None)
    sample_rate = getattr(generation_config, "sample_rate", None) or 24000

    # NOTE: the file name is fixed, so every call overwrites the previous vocals.
    os.makedirs("outputs", exist_ok=True)
    output_path = "outputs/vocals.wav"
    write(output_path, sample_rate, audio_array)
    return output_path, sample_rate, audio_array
def generate_simple_music(prompt, voice_preset="v2/en_speaker_9"):
    """Generate a simple backing track for *prompt* by prompting Bark to hum.

    Args:
        prompt: Theme of the song; embedded in the music instruction text.
        voice_preset: Bark speaker preset identifier handed to the processor.

    Returns:
        A ``(output_path, sample_rate, audio_array)`` tuple: the path of the
        written WAV file, its sample rate in Hz, and the generated samples as
        a NumPy array.
    """
    print(f"Generating music for theme: {prompt}...")

    # Describe the desired backing track and seed it with humming.
    music_text = f"[music: {prompt}, instrumental, background music without lyrics] ♪ hmm hmm hmm ♪"
    inputs = bark_processor(text=music_text, voice_preset=voice_preset)

    # The model may have been moved to an accelerator, but the processor output
    # is not moved along with it. Put every entry that supports `.to` on the
    # model's device so `generate` does not mix devices; entries without a
    # `.to` method are passed through unchanged.
    model_device = bark_model.device
    inputs = {
        key: value.to(model_device) if hasattr(value, "to") else value
        for key, value in inputs.items()
    }

    audio_array = bark_model.generate(**inputs)
    audio_array = audio_array.cpu().numpy().squeeze()

    # Ask the model for its output rate; fall back to 24000 Hz (the value this
    # function has always reported) if the generation config does not carry one.
    generation_config = getattr(bark_model, "generation_config", None)
    sample_rate = getattr(generation_config, "sample_rate", None) or 24000

    # NOTE: the file name is fixed, so every call overwrites the previous track.
    os.makedirs("outputs", exist_ok=True)
    output_path = "outputs/music.wav"
    write(output_path, sample_rate, audio_array)
    return output_path, sample_rate, audio_array
def mix_audio(vocals_data, music_data, vocals_volume=0.7, music_volume=0.4):
    """Blend a vocal track and a backing track into one normalized WAV file.

    Args:
        vocals_data: ``(path, sample_rate, samples)`` tuple for the vocals.
        music_data: ``(path, sample_rate, samples)`` tuple for the music.
        vocals_volume: Gain applied to the vocal samples before summing.
        music_volume: Gain applied to the music samples before summing.

    Returns:
        Path of the written mix, ``"outputs/final_song.wav"``.

    The shorter track is zero-padded at the end to the length of the longer
    one. The mix is scaled so its peak magnitude is 0.9; a completely silent
    mix is written as-is instead of being divided by zero.
    """
    # Only the samples and the vocals' sample rate are needed. The music
    # track's own rate is not consulted: both tracks are assumed to share the
    # vocals' rate.
    _, vocals_sr, vocals_array = vocals_data
    _, _, music_array = music_data

    # Work in float32 so the written file has one predictable sample format
    # regardless of the dtype the tracks arrive in.
    vocals_array = np.asarray(vocals_array, dtype=np.float32)
    music_array = np.asarray(music_array, dtype=np.float32)

    # Zero-pad the tail of the shorter track; a pad width of 0 is a no-op.
    target_length = max(len(vocals_array), len(music_array))
    vocals_array = np.pad(vocals_array, (0, target_length - len(vocals_array)))
    music_array = np.pad(music_array, (0, target_length - len(music_array)))

    # Vocals get the larger default gain so they sit above the music.
    mixed_audio = vocals_volume * vocals_array + music_volume * music_array

    # Normalize to a 0.9 peak. Skip the division for empty or all-zero audio,
    # which would otherwise raise or fill the output with NaN.
    peak = float(np.max(np.abs(mixed_audio))) if mixed_audio.size else 0.0
    if peak > 0.0:
        mixed_audio = mixed_audio / peak * 0.9
    mixed_audio = mixed_audio.astype(np.float32)

    os.makedirs("outputs", exist_ok=True)
    output_path = "outputs/final_song.wav"
    write(output_path, vocals_sr, mixed_audio)
    return output_path
def text_to_song(prompt, voice_selection):
    """Run the full pipeline: prompt -> lyrics -> vocals + music -> mixed song.

    Args:
        prompt: Text description of the song.
        voice_selection: Voice label chosen in the UI; labels that are not in
            the preset table use the ``"v2/en_speaker_6"`` preset.

    Returns:
        A ``(lyrics, final_song_path)`` tuple.
    """
    print(f"Processing prompt: {prompt}")

    # UI label -> Bark speaker preset.
    preset_by_label = dict(
        [
            ("Female Singer", "v2/en_speaker_6"),
            ("Male Singer", "v2/en_speaker_5"),
            ("Female Alto", "v2/en_speaker_9"),
            ("Male Baritone", "v2/en_speaker_0"),
        ]
    )
    if voice_selection in preset_by_label:
        chosen_preset = preset_by_label[voice_selection]
    else:
        chosen_preset = "v2/en_speaker_6"

    # 1) words, 2) sung vocals, 3) backing track (keeps its own default voice).
    lyrics = generate_lyrics(prompt)
    vocal_track = generate_vocals(lyrics, voice_preset=chosen_preset)
    backing_track = generate_simple_music(prompt)

    # 4) blend both tracks into the final file.
    return lyrics, mix_audio(vocal_track, backing_track)
# Build the Gradio UI. Components and the click handler are declared inside
# the Blocks context so they are attached to `demo`.
# NOTE(review): the nesting below is reconstructed from statement order; the
# two inputs are assumed to share the first row - confirm against the original.
VOICE_CHOICES = ["Female Singer", "Male Singer", "Female Alto", "Male Baritone"]
EXAMPLE_INPUTS = [
    ["a heartfelt country ballad about lost love", "Male Singer"],
    ["an upbeat pop song about friendship", "Female Singer"],
    ["a rock anthem about overcoming challenges", "Male Baritone"],
]

with gr.Blocks() as demo:
    gr.Markdown("# Text to Song Generation App")
    gr.Markdown("Enter a prompt describing the song you want to generate")

    # Inputs: the song description and the singing voice.
    with gr.Row():
        prompt_input = gr.Textbox(
            label="Prompt",
            placeholder="Enter a description for your song...",
            value="a love song about summer",
        )
        voice_selection = gr.Dropdown(
            choices=VOICE_CHOICES,
            label="Select Voice",
            value="Female Singer",
        )

    generate_button = gr.Button("Generate Song")

    # Outputs: the generated lyrics and the mixed audio.
    with gr.Row():
        lyrics_output = gr.Textbox(label="Generated Lyrics")
    with gr.Row():
        audio_output = gr.Audio(label="Generated Song")

    # Clicking the button runs the whole text-to-song pipeline.
    generate_button.click(
        fn=text_to_song,
        inputs=[prompt_input, voice_selection],
        outputs=[lyrics_output, audio_output],
    )

    # Ready-made prompt/voice pairs that fill in the two inputs.
    gr.Examples(
        examples=EXAMPLE_INPUTS,
        inputs=[prompt_input, voice_selection],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()