teoha committed on
Commit e953bef · 1 Parent(s): 71cd364

Migrated business logic out and replaced it with a FastAPI + Celery API endpoint

Files changed (7)
  1. .env +0 -2
  2. Dockerfile +0 -10
  3. holosubs.py +0 -119
  4. main.py +19 -9
  5. requirements.txt +3 -16
  6. transcribe.py +0 -78
  7. youtubeaudio.py +0 -51
.env DELETED
@@ -1,2 +0,0 @@
- peft_model_id ="teoha/openai-whisper-medium-LORA-ja"
- install_location = "/tmp/elite_understanding"

Dockerfile CHANGED
@@ -1,17 +1,7 @@
  FROM pytorch/pytorch

  WORKDIR /code
- RUN mkdir /.cache
- RUN chmod 1777 /.cache
  COPY ./requirements.txt /code/requirements.txt
- RUN echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib/libcudart.so' >> ~/.bashrc
- RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
- RUN /opt/conda/bin/pip install peft
- RUN /opt/conda/bin/pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
- # Expose the secret SECRET_EXAMPLE at buildtime and use its value as git remote URL
- RUN --mount=type=secret,id=HUGGINGFACE_TOKEN,mode=0444,required=true \
-     huggingface-cli login --token $(cat /run/secrets/HUGGINGFACE_TOKEN) && \
-     echo "HUGGINGFACE_TOKEN=$( cat /run/secrets/HUGGINGFACE_TOKEN )" >> .env

  COPY . .

holosubs.py DELETED
@@ -1,119 +0,0 @@
- """"
- Entry point and main execution block of the video transcription job
- """
- import re
-
- from dotenv import load_dotenv
- from youtubeaudio import YoutubeAudio
- from transcribe import Transcriber
- import torchaudio
- from pyannote.audio import Pipeline
- from webvtt import WebVTT, Caption
- import torch
- import logging
- from huggingface_hub._login import _login
- import os
-
- load_dotenv()
- WHISPER_SAMPLE_RATE=16000
- TIMESTAMP_PATTERN='[0-9]+:[0-9]+:[0-9]+\.[0-9]+'
- MAX_CHUNK_DURATION=30000 # ms
-
- format = "%(asctime)s: %(message)s"
- logging.basicConfig(format=format, level=logging.DEBUG,
-                     datefmt="%H:%M:%S")
- _login(token=os.getenv('HUGGINGFACE_TOKEN'), add_to_git_credential=False)
-
- def get_video_vtt(url) -> str:
-     # Download wav file
-     ytaudio=YoutubeAudio(url)
-     ytaudio.download_audio()
-     # Load audio
-     audio, sample_rate = torchaudio.load(ytaudio.filename)
-     audio_dict={"waveform": audio, "sample_rate": sample_rate}
-     # Diarization
-     pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization@2.1', use_auth_token=True)
-     dzs = pipeline(audio_dict)
-     groups = group_segments(str(dzs).splitlines())
-     # Preprocess audio segments for translation
-     audio = torchaudio.functional.resample(audio, orig_freq=sample_rate, new_freq=WHISPER_SAMPLE_RATE)
-     audio_segments, timestamps = get_segments(groups, audio)
-     # Decoding audio segments into subtitles
-     transcriber = Transcriber(task="translate")
-     captions = decode_segments(audio_segments, timestamps, transcriber)
-     vtt = create_vtt(captions)
-     ytaudio.clean()
-     return vtt.content
-
- def decode_segments(audio_segments, timestamps, transcriber):
-     captions = []
-     for i, segment in enumerate(audio_segments):
-         result = transcriber.decode(segment)
-         captions.append(Caption(timestamps[i][0], timestamps[i][1], result))
-         logging.info(f"Chunk output no.{i+1}: {result}")
-     return captions
-
- def millisec(timeStr):
-     spl = timeStr.split(":")
-     s = (int)((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2]) )* 1000)
-     return s
-
- def group_segments(dzs):
-     groups = []
-     g = []
-     lastend = 0
-
-     for d in dzs:
-         if g and (g[0].split()[-1] != d.split()[-1]): #same speaker
-             groups.append(g)
-             g = []
-
-         g.append(d)
-
-         end = re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=d)[1]
-         end = millisec(end)
-         if (lastend > end): #segment engulfed by a previous segment
-             groups.append(g)
-             g = []
-         else:
-             lastend = end
-     if g:
-         groups.append(g)
-     logging.debug(groups)
-     return groups
-
- def create_vtt(captions):
-     vtt = WebVTT()
-     for caption in captions:
-         vtt.captions.append(caption)
-     return vtt
-     # vtt.save(path)
-
- def get_segments(groups, audio):
-     monoaudio=torch.mean(input=audio,dim=0).numpy()
-     audio_segments = []
-     timestamps = []
-     for g in groups:
-         cur_start_time, cur_end_time = re.findall(TIMESTAMP_PATTERN, string=g[0])
-         cur_start_millisec = millisec(cur_start_time) #- spacermilli
-         cur_end_millisec = millisec(cur_end_time) #- spacermilli
-         for window in g[1:]:
-             start_time, end_time = re.findall(TIMESTAMP_PATTERN, string=window)
-             start_millisec = millisec(start_time) #- spacermilli
-             end_millisec = millisec(end_time) #- spacermilli
-             # Check if new window exceeds chunk size
-             seg_duration_with_window=end_millisec-cur_start_millisec
-             if seg_duration_with_window>MAX_CHUNK_DURATION: # Segment with window exceeds max chunk duration
-                 start_frame, end_frame = cur_start_millisec*WHISPER_SAMPLE_RATE//1000, cur_end_millisec*WHISPER_SAMPLE_RATE//1000
-                 audio_segments.append(monoaudio[start_frame:end_frame])
-                 timestamps.append((cur_start_time, cur_end_time))
-                 cur_start_time, cur_end_time = start_time, end_time
-                 cur_start_millisec, cur_end_millisec = start_millisec, end_millisec
-             else:
-                 cur_end_time=end_time
-                 cur_end_millisec=end_millisec
-         # Final update
-         start_frame, end_frame = cur_start_millisec*WHISPER_SAMPLE_RATE//1000, cur_end_millisec*WHISPER_SAMPLE_RATE//1000
-         audio_segments.append(monoaudio[start_frame:end_frame])
-         timestamps.append((cur_start_time, cur_end_time))
-     return audio_segments, timestamps

main.py CHANGED
@@ -1,14 +1,24 @@
  from fastapi import FastAPI
- from holosubs import get_video_vtt
- from pydantic import BaseModel
+ from celery import Celery

- class Url(BaseModel):
-     url: str
-
-
- app = FastAPI()
-
- @app.post("/captions/")
- def read_root(url: Url):
-     vtt_captions = get_video_vtt(url.url)
-     return {"captions": vtt_captions}
+ app = FastAPI()
+
+ BROKER_URL = 'redis://139.59.127.180:6379/0'
+ BACKEND_URL = 'redis://139.59.127.180:6379/0'
+
+ celery = Celery(
+     __name__,
+     broker=BROKER_URL,
+     backend=BACKEND_URL
+ )
+
+
+ @app.get("/")
+ async def root():
+     return {"message": "Hello World"}
+
+ @celery.task
+ def divide(x, y):
+     import time
+     time.sleep(5)
+     return x / y

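For context, a minimal sketch (not part of this commit) of how the new `divide` task could be exposed through the FastAPI app. The `/divide` and `/tasks/{task_id}` routes and their handler names are hypothetical, and the sketch assumes the Redis broker configured above is reachable and a Celery worker is running:

# Hypothetical routes: enqueue the divide task and poll for its result.
from celery.result import AsyncResult

@app.post("/divide")
async def enqueue_divide(x: float, y: float):
    # .delay() pushes the task onto the broker and returns immediately
    task = divide.delay(x, y)
    return {"task_id": task.id}

@app.get("/tasks/{task_id}")
async def get_task_status(task_id: str):
    # Look up the task in the result backend without blocking the request
    result = AsyncResult(task_id, app=celery)
    return {
        "task_id": task_id,
        "status": result.status,
        "result": result.result if result.ready() else None,
    }

This is the usual FastAPI + Celery pattern: the request returns a task id right away and the client polls for the result, so long-running work never blocks the web process.
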
requirements.txt CHANGED
@@ -1,16 +1,3 @@
- fastapi==0.74.*
- requests==2.27.*
- sentencepiece==0.1.*
- uvicorn[standard]==0.17.*
- numpy==1.24.4
- pyannote.audio==1.1.2
- pyannote.core==5.0.0
- pyannote.database==5.0.1
- pyannote.metrics==3.2.1
- pyannote.pipeline==1.5.2
- python-dotenv==1.0.0
- torch==2.0.1
- torchaudio==2.0.2
- transformers==4.31.0
- webvtt_py==0.4.6
- yt_dlp==2023.7.6
+ celery==5.1.2
+ fastapi==0.103.2
+ pydantic==1.10.12

transcribe.py DELETED
@@ -1,78 +0,0 @@
- """
- Represents a model that transcribes and translates audio.
- """
-
- import logging
- import os
- from typing import Union
-
- import numpy as np
- import torch
- from dotenv import load_dotenv
- from peft import PeftConfig, PeftModel
- from transformers import (AutomaticSpeechRecognitionPipeline,
-                           WhisperForConditionalGeneration, WhisperProcessor,
-                           WhisperTokenizer)
-
- load_dotenv()
- format = "%(asctime)s: %(message)s"
- logging.basicConfig(format=format, level=logging.DEBUG,
-                     datefmt="%H:%M:%S")
-
- class Transcriber:
-     def __init__(self, model_id="teoha/openai-whisper-medium-LORA-ja", language="Japanese", task="translate"):
-         self.language=language
-         self.task=task
-         peft_model_id = model_id if model_id else os.getenv('peft_model_id')
-         # TODO: Fix Download and install model locally
-         # self.install_model(peft_model_id)
-         self.initialize_pipe(peft_model_id) #initialize pipe
-
-     def install_model(self, peft_model_id:str) -> None:
-         save_location = os.path.join(os.getenv('install_location'), peft_model_id)
-         offload_location = os.path.join(os.getenv('install_location'), "offload")
-         #Save Model
-         peft_config = PeftConfig.from_pretrained(peft_model_id)
-         model = WhisperForConditionalGeneration.from_pretrained(
-             peft_config.base_model_name_or_path,
-             load_in_8bit=False, device_map="auto"
-         )
-         model = PeftModel.from_pretrained(model, peft_model_id, offload_folder="offload_location")
-         model.save_pretrained(save_location)
-
-         #Save tokenizer/processor
-         tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
-         processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
-         tokenizer.save_pretrained(save_location)
-         processor.save_pretrained(save_location)
-         logging.info("Installation Completed successfully")
-
-     def initialize_pipe(self, peft_model_id: str) -> None:
-         offload_location = os.path.join(os.getenv('install_location'), "offload")
-         # Initalize model configs
-         peft_config = PeftConfig.from_pretrained(peft_model_id)
-         model = WhisperForConditionalGeneration.from_pretrained(peft_config.base_model_name_or_path, load_in_8bit=False, device_map="auto")
-         model = PeftModel.from_pretrained(model, peft_model_id, offload_folder=offload_location)
-         tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
-         processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, language=self.language, task=self.task)
-         feature_extractor = processor.feature_extractor
-         # Initialize class variables
-         self.forced_decoder_ids = processor.get_decoder_prompt_ids(language=self.language, task=self.task)
-         self.pipe = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
-         logging.info("Pipe successfully initialized")
-
-
-     def decode(self, audio: Union[np.ndarray, bytes, str]) -> str:
-         '''
-         Transcribes a sequence of floats representing an audio snippet.
-         Args:
-             inputs (:obj:`np.ndarray` or :obj:`bytes` or :obj:`str`):
-                 The inputs is either a raw waveform (:obj:`np.ndarray` of shape (n, ) of type :obj:`np.float32` or
-                 :obj:`np.float64`) at the correct sampling rate (no further check will be done) or a :obj:`str` that is
-                 the filename of the audio file, the file will be read at the correct sampling rate to get the waveform
-                 using `ffmpeg`. This requires `ffmpeg` to be installed on the system. If `inputs` is :obj:`bytes` it is
-                 supposed to be the content of an audio file and is interpreted by `ffmpeg` in the same way.
-         '''
-         with torch.cuda.amp.autocast():
-             text = self.pipe(audio, generate_kwargs={"forced_decoder_ids": self.forced_decoder_ids})["text"]
-             return text

youtubeaudio.py DELETED
@@ -1,51 +0,0 @@
1
- """
2
- Represents a Youtube video
3
- """
4
-
5
- from dotenv import load_dotenv
6
- import logging
7
- from yt_dlp import YoutubeDL
8
- import os
9
- from pathlib import Path
10
-
11
- load_dotenv()
12
- format = "%(asctime)s: %(message)s"
13
- logging.basicConfig(format=format, level=logging.DEBUG,
14
- datefmt="%H:%M:%S")
15
-
16
- class YoutubeAudio:
17
- def __init__(self, url, dir="/tmp/holosubs/audio"):
18
- self.url=url
19
- self.dir=dir
20
-
21
- def download_audio(self):
22
- ydl_opts = {
23
- 'outtmpl': os.path.join(self.dir, "%(id)s_%(epoch)s.%(ext)s"),
24
- 'logger': logging,
25
- 'progress_hooks': [self.progress_hook],
26
- 'format': 'm4a/bestaudio/best',
27
- 'postprocessors': [{ # Extract audio using ffmpeg
28
- 'key': 'FFmpegExtractAudio',
29
- 'preferredcodec': 'wav',
30
- }]
31
- }
32
- with YoutubeDL(ydl_opts) as ydl:
33
- error_code = ydl.download([self.url])
34
-
35
- def clean(self):
36
- if not self.filename:
37
- logging.error("Audio not downloaded")
38
- return
39
- location=os.path.join(self.dir, self.filename)
40
- if os.path.exists(self.filename):
41
- os.remove(self.filename)
42
- logging.info(f"File {self.filename} successfully removed")
43
- self.filename=None
44
- else:
45
- print(f"File {self.filename} does not exist")
46
-
47
- def progress_hook(self, d):
48
- if d['status'] == 'finished':
49
- self.filename=os.path.join(self.dir, Path(d.get('info_dict').get('_filename')).stem + ".wav")
50
- print(f'Done downloading {self.filename}, now post-processing ...')
51
-