ahohpotato committed · Commit a4d5e9e · verified · 1 Parent(s): 5240029

Upload 8 files

Files changed (8)
  1. QA.py +81 -0
  2. README.md +134 -12
  3. audio.py +25 -0
  4. captions.py +125 -0
  5. main.py +601 -0
  6. models.py +78 -0
  7. packages.txt +2 -0
  8. processing.py +127 -0
QA.py ADDED
@@ -0,0 +1,81 @@
+ import torch
+
+ def answer_question(question, context, models):
+     """Answer a question about the video context using the Qwen3 LLM"""
+     try:
+         prompt = f"""Based on the following video analysis data, please answer the question.
+
+ Video Captions and Transcription:
+ {context}
+
+ Question: {question}
+
+ Please provide a clear and concise answer based only on the information provided above."""
+
+         messages = [{"role": "user", "content": prompt}]
+
+         text = models['qa_tokenizer'].apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True,
+             enable_thinking=True  # enable Qwen3's thinking mode
+         )
+         model_inputs = models['qa_tokenizer'](
+             [text],
+             return_tensors="pt"
+         ).to(models['qa_model'].device)
+
+         with torch.no_grad():
+             generated_ids = models['qa_model'].generate(
+                 **model_inputs,
+                 max_new_tokens=32768,
+                 # do_sample=False,
+                 # temperature=0.7,
+                 # eos_token_id=models['qa_tokenizer'].eos_token_id
+             )
+
+         # Extract the new tokens after the input prompt
+         input_length = model_inputs.input_ids.shape[-1]
+         output_ids = generated_ids[0][input_length:].tolist()
+         # Skip the thinking block: token id 151668 is "</think>" for Qwen3
+         try:
+             index = len(output_ids) - output_ids[::-1].index(151668)
+         except ValueError:
+             index = 0
+
+         answer = models['qa_tokenizer'].decode(output_ids[index:], skip_special_tokens=True).strip("\n")
+
+         return answer
+
+     except Exception as e:
+         return f"Error generating answer: {e}"
+
+
+ def get_context_for_qa(session_id, conn):
+     """Retrieve all captions and the transcription for QA context"""
+     cursor = conn.cursor()
+
+     # Get captions
+     cursor.execute(
+         "SELECT timestamp, caption FROM captions WHERE session_id = ? ORDER BY timestamp",
+         (session_id,)
+     )
+     captions = cursor.fetchall()
+
+     # Get transcription
+     cursor.execute(
+         "SELECT transcription FROM transcriptions WHERE session_id = ?",
+         (session_id,)
+     )
+     transcription_result = cursor.fetchone()
+
+     context = "CAPTIONS:\n"
+     for timestamp, caption in captions:
+         context += f"At {timestamp:.1f}s: {caption}\n"
+
+     if transcription_result:
+         context += f"\nAUDIO TRANSCRIPTION:\n{transcription_result[0]}"
+
+     return context
README.md CHANGED
@@ -1,12 +1,134 @@
- ---
- title: Video Analysis Qa System
- emoji: 📉
- colorFrom: red
- colorTo: pink
- sdk: gradio
- sdk_version: 5.42.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 🎥 Video Analysis QA System
+
+ An intelligent video analysis system that extracts insights from videos through automated captioning, audio transcription, and natural language question answering.
+
+ ## ✨ Features
+
+ - **Video Processing**: Upload videos or capture directly from webcam
+ - **Frame Analysis**: Automatic extraction and intelligent captioning of video frames
+ - **Audio Transcription**: Speech-to-text conversion using advanced AI models
+ - **Question Answering**: Natural language queries about video content
+ - **Session Management**: Organize and revisit previous video analyses
+ - **Real-time Webcam**: Live video capture and processing
+
+ ## 🚀 Getting Started
+
+ ### Prerequisites
+
+ ```bash
+ pip install -r requirements.txt
+ ```
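+
+ Note: a `requirements.txt` is not part of this upload. A plausible one, inferred from the imports used across the modules (the package set is grounded in the code; versions are deliberately left unpinned), would be:
+
+ ```text
+ gradio
+ torch
+ transformers
+ accelerate  # required by device_map="auto" in models.py
+ opencv-python
+ librosa
+ Pillow
+ ```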
+
+ ### Installation
+
+ 1. Clone the repository
+ 2. Install dependencies
+ 3. Run the application:
+
+ ```bash
+ python main.py
+ ```
+
+ ## 🏗️ Architecture
+
+ The system consists of several modular components (see the sketch after this list):
+
+ - **`main.py`**: Gradio web interface and application orchestration
+ - **`models.py`**: AI model loading and SQLite database initialization
+ - **`processing.py`**: Video processing pipeline coordinator
+ - **`captions.py`**: Frame extraction and image captioning
+ - **`audio.py`**: Audio extraction and transcription
+ - **`QA.py`**: Question answering and context retrieval
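+
+ As a minimal sketch of how these modules fit together, mirroring the calls `main.py` makes (the video path is a placeholder):
+
+ ```python
+ from models import init_database, load_models
+ from processing import process_video
+ from QA import get_context_for_qa, answer_question
+
+ conn = init_database()   # SQLite connection with tables created
+ models = load_models()   # captioning, Whisper, and QA models
+
+ process_video("example.mp4", "main_session", models, conn)
+ context = get_context_for_qa("main_session", conn)
+ print(answer_question("What was the person doing?", context, models))
+ ```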
+
+ ## 🤖 AI Models Used
+
+ ### Image Captioning
+ - **Model**: [quadranttechnologies/qhub-blip-image-captioning-finetuned](https://huggingface.co/quadranttechnologies/qhub-blip-image-captioning-finetuned)
+ - **Purpose**: Generate descriptive captions for video frames
+
+ ### Audio Transcription
+ - **Model**: [openai/whisper-medium](https://huggingface.co/openai/whisper-medium)
+ - **Purpose**: Convert speech to text from video audio tracks
+
+ ### Question Answering
+ - **Model**: [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B)
+ - **Purpose**: Answer natural language questions about video content
+
+ ## 📱 Usage
+
+ ### Video Input Options
+
+ 1. **File Upload**: Support for MP4, AVI, MOV, and MKV formats
+ 2. **Webcam Capture**: Real-time recording with customizable duration and FPS
+
+ ### Analysis Process
+
+ 1. **Frame Extraction**: Automatically samples frames at specified intervals
+ 2. **Caption Generation**: Creates descriptive text for each frame
+ 3. **Audio Processing**: Extracts and transcribes speech content
+ 4. **Database Storage**: Stores results for persistent access
+
+ ### Question Answering
+
+ Ask natural language questions about your videos:
+ - "What objects were visible in the video?"
+ - "What was the person doing?"
+ - "What did someone say about [topic]?"
+
+ ## 💾 Data Management
+
+ - **SQLite Database**: Stores captions, transcriptions, and session data
+ - **Session System**: Organizes analyses by unique session IDs
+ - **Persistent Storage**: Access previous analyses anytime
+
+ ## 🛠️ Technical Details
+
+ ### Video Processing
+ - Configurable frame sampling intervals
+ - Multi-format video support
+ - Real-time webcam integration
+
+ ### AI Pipeline
+ - GPU acceleration when available
+ - Models loaded once at startup and shared across requests
+ - Batch processing for improved performance
+
+ ### Database Schema
+
+ The following tables are created in `models.py` (schema shown after this list):
+ - `video_sessions`: Session metadata
+ - `captions`: Frame-level descriptions with timestamps
+ - `transcriptions`: Full audio transcripts per session
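+
+ For reference, these are the `CREATE TABLE` statements from `models.py`:
+
+ ```sql
+ CREATE TABLE IF NOT EXISTS video_sessions (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     session_id TEXT UNIQUE,
+     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+ );
+
+ CREATE TABLE IF NOT EXISTS captions (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     session_id TEXT,
+     timestamp REAL,
+     caption TEXT,
+     FOREIGN KEY (session_id) REFERENCES video_sessions (session_id)
+ );
+
+ CREATE TABLE IF NOT EXISTS transcriptions (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     session_id TEXT,
+     transcription TEXT,
+     FOREIGN KEY (session_id) REFERENCES video_sessions (session_id)
+ );
+ ```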
+
+ ## 🔧 Configuration
+
+ ### Webcam Settings
+ - Adjustable recording duration (3-30 seconds)
+ - Configurable frame rate (1-15 FPS)
+ - Real-time preview and progress tracking
+
+ ### Processing Parameters
+ - Frame extraction interval (default: 0.5 seconds; see the example below)
+ - Caption generation limits
+ - Audio sampling rate (16 kHz for Whisper compatibility)
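+
+ For example, the "Analysis FPS" chosen in the UI is converted to a sampling interval and then to a frame stride (mirroring `processing.py` and `captions.py`; the 30 FPS source video is an assumption):
+
+ ```python
+ fps_setting = 5                         # "Analysis FPS" chosen in the UI
+ interval = 1.0 / fps_setting            # 0.2 s between sampled frames
+ video_fps = 30                          # assumed source video frame rate
+ frame_stride = max(1, int(video_fps * interval))  # keep every 6th frame
+ ```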
+
+ ## 🚨 System Requirements
+
+ - **Python 3.8+**
+ - **CUDA-compatible GPU** (optional, for faster processing)
+ - **Webcam** (for live capture functionality)
+ - **FFmpeg** (for video processing)
+
+ ## 🤝 Contributing
+
+ This system is modular and extensible. Key areas for enhancement:
+ - Additional video formats
+ - More sophisticated AI models
+ - Advanced question types
+ - Export capabilities
+
+ ## 📄 License
+
+ Open source project - see individual model licenses for AI components.
+
+ ---
+
+ *Built with Gradio, PyTorch, and Transformers for seamless video intelligence.*
audio.py ADDED
@@ -0,0 +1,25 @@
+ import librosa
+ import torch
+
+ def extract_audio(video_path):
+     """Extract the audio track from a video file"""
+     try:
+         # librosa (via audioread/ffmpeg) decodes audio directly from the video container
+         audio, sr = librosa.load(video_path, sr=16000)  # Whisper expects 16 kHz
+         return audio, sr
+     except Exception as e:
+         print(f"Error extracting audio: {e}")
+         return None, None
+
+ def transcribe_audio(audio, sr, models):
+     """Transcribe audio using Whisper"""
+     try:
+         # Note: the Whisper feature extractor pads/truncates input to 30 seconds
+         inputs = models['whisper_processor'](audio, sampling_rate=sr, return_tensors="pt").input_features.to(models['device'])
+
+         with torch.no_grad():
+             pred_ids = models['whisper_model'].generate(inputs)
+         transcription = models['whisper_processor'].batch_decode(pred_ids, skip_special_tokens=True)[0]
+
+         return transcription
+     except Exception as e:
+         return f"Error transcribing audio: {e}"
captions.py ADDED
@@ -0,0 +1,125 @@
+ import cv2
+ from PIL import Image
+ import torch
+
+ def extract_frames(video_path, interval=0.5):
+     """Backward-compatible wrapper - extract frames at a fixed interval"""
+     return extract_frames_with_fps(video_path, interval=interval)
+
+ def extract_frames_with_fps(video_path, interval=0.5):
+     """Extract frames from a video at the specified interval (supports FPS control)
+
+     Args:
+         video_path: Path to video file
+         interval: Time interval between frames in seconds (1/fps)
+
+     Returns:
+         frames: List of PIL Images
+         timestamps: List of timestamp values
+     """
+     frames = []
+     timestamps = []
+
+     try:
+         # Open video
+         cap = cv2.VideoCapture(video_path)
+         if not cap.isOpened():
+             print(f"Error: Could not open video {video_path}")
+             return frames, timestamps
+
+         # Get video properties
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         if fps <= 0:
+             print(f"Error: Could not determine FPS for {video_path}")
+             cap.release()
+             return frames, timestamps
+         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+         duration = total_frames / fps
+
+         print(f"Video info: {fps:.2f} FPS, {duration:.2f}s duration, {total_frames} total frames")
+         print(f"Extracting frames every {interval:.2f} seconds")
+
+         # Convert the time interval to a frame stride; never drop below 1
+         # (avoids a modulo-by-zero when the requested rate exceeds the video FPS)
+         frame_interval = max(1, int(fps * interval))
+         frame_count = 0
+
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             # Extract frame at specified intervals
+             if frame_count % frame_interval == 0:
+                 # Convert BGR to RGB
+                 frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+                 # Convert to PIL Image
+                 pil_image = Image.fromarray(frame_rgb)
+
+                 # Calculate timestamp
+                 timestamp = frame_count / fps
+
+                 frames.append(pil_image)
+                 timestamps.append(timestamp)
+
+                 if len(frames) % 10 == 0:
+                     print(f"Extracted {len(frames)} frames...")
+
+             frame_count += 1
+
+         cap.release()
+         print(f"Extraction complete: {len(frames)} frames extracted")
+
+     except Exception as e:
+         print(f"Error extracting frames: {e}")
+
+     return frames, timestamps
+
+ def generate_caption(image, models):
+     """Generate a caption for a single image with the BLIP captioning model"""
+     try:
+         # The BLIP processor takes the image via the images= keyword
+         inputs = models['caption_processor'](images=image, return_tensors="pt").to(models['device'])
+
+         with torch.no_grad():
+             # max_new_tokens bounds the caption length
+             output_ids = models['caption_model'].generate(**inputs, max_new_tokens=50)
+         caption = models['caption_processor'].batch_decode(output_ids, skip_special_tokens=True)[0]
+
+         return caption
+
+     except Exception as e:
+         print(f"Error generating caption: {e}")
+         return f"Error generating caption: {e}"
+
+ def batch_generate_captions(frames, models, batch_size=4):
+     """Generate captions for multiple frames in batches (more efficient)"""
+     captions = []
+
+     try:
+         processor = models['caption_processor']
+         model = models['caption_model']
+         device = models['device']
+
+         # Process frames in batches
+         for i in range(0, len(frames), batch_size):
+             batch_frames = frames[i:i + batch_size]
+
+             # The processor accepts a list of images for batched inference
+             inputs = processor(images=batch_frames, return_tensors="pt").to(device)
+
+             # Generate captions
+             with torch.no_grad():
+                 outputs = model.generate(**inputs, max_new_tokens=50)
+
+             # Decode all captions in the batch at once
+             batch_captions = processor.batch_decode(outputs, skip_special_tokens=True)
+
+             captions.extend(batch_captions)
+             print(f"Generated captions for batch {i//batch_size + 1}/{(len(frames)-1)//batch_size + 1}")
+
+     except Exception as e:
+         print(f"Error in batch caption generation: {e}")
+         # Fall back to individual processing; rebuild the list so partial
+         # batch results don't leave captions misaligned with timestamps
+         captions = [generate_caption(frame, models) for frame in frames]
+
+     return captions
main.py ADDED
@@ -0,0 +1,601 @@
+ import gradio as gr
+ from models import init_database, load_models
+ from QA import get_context_for_qa, answer_question
+ from processing import process_video
+ import os
+ import time
+ import cv2
+
+ # Global variables
+ models = None
+ conn = None
+ current_session_id = "main_session"
+ current_fps_setting = 5
+
+ def clear_database_for_new_video(session_id, conn):
+     """Clear database entries for a specific session (new video)"""
+     try:
+         cursor = conn.cursor()
+         # Clear previous data for this session
+         cursor.execute("DELETE FROM captions WHERE session_id = ?", (session_id,))
+         cursor.execute("DELETE FROM transcriptions WHERE session_id = ?", (session_id,))
+         cursor.execute("DELETE FROM video_sessions WHERE session_id = ?", (session_id,))
+         conn.commit()
+         print(f"Database cleared for session: {session_id}")
+     except Exception as e:
+         print(f"Error clearing database: {e}")
+
+ def process_video_with_fps(video_path, session_id, models, conn, fps):
+     """Wrapper around the processing pipeline that honors the FPS setting"""
+     try:
+         # Import the pipeline modules
+         from captions import extract_frames_with_fps, generate_caption
+         from audio import extract_audio, transcribe_audio
+
+         # Extract frames with custom FPS
+         print(f"Extracting frames at {fps} FPS...")
+         interval = 1.0 / fps  # Convert FPS to interval
+         frames, timestamps = extract_frames_with_fps(video_path, interval=interval)
+
+         if not frames:
+             print("No frames could be extracted from the video.")
+             return
+
+         # Generate captions
+         print(f"Generating captions for {len(frames)} frames...")
+         cursor = conn.cursor()
+
+         for i, (frame, timestamp) in enumerate(zip(frames, timestamps)):
+             caption = generate_caption(frame, models)
+             cursor.execute(
+                 "INSERT INTO captions (session_id, timestamp, caption) VALUES (?, ?, ?)",
+                 (session_id, timestamp, caption)
+             )
+
+             # Update status every 10 frames
+             if i % 10 == 0:
+                 print(f"Generating captions... {i+1}/{len(frames)}")
+
+         conn.commit()
+
+         # Extract and transcribe audio
+         print("Extracting and transcribing audio...")
+         audio, sr = extract_audio(video_path)
+
+         if audio is not None and len(audio) > 0:
+             transcription = transcribe_audio(audio, sr, models)
+             cursor.execute(
+                 "INSERT INTO transcriptions (session_id, transcription) VALUES (?, ?)",
+                 (session_id, transcription)
+             )
+             conn.commit()
+         else:
+             print("No audio found in the video or audio extraction failed.")
+
+         print("Processing complete!")
+
+     except ImportError:
+         # Fall back to the original process_video function if the FPS-aware helpers are unavailable
+         print("Using original process_video function...")
+         process_video(video_path, session_id, models, conn)
+     except Exception as e:
+         print(f"Error processing video: {e}")
+
+ def initialize_system():
+     """Initialize the database and load models (the database is NOT cleared here)"""
+     global models, conn
+
+     # Initialize database (but don't clear it here)
+     conn = init_database()
+
+     # Load models
+     models = load_models()
+     if models is None:
+         raise Exception("Failed to load models. Please check your internet connection and try again.")
+
+     return "✅ System initialized successfully!"
+
+ def process_uploaded_video(video_file, fps_setting, progress=gr.Progress()):
+     """Process an uploaded video file with the chosen FPS setting (clears the DB first)"""
+     global models, conn, current_session_id
+
+     if video_file is None:
+         return "❌ Please upload a video file", "", ""
+
+     if models is None or conn is None:
+         return "❌ System not initialized. Please wait for initialization to complete.", "", ""
+
+     progress(0.05, desc="Clearing previous data...")
+
+     # Clear database for the new video
+     clear_database_for_new_video(current_session_id, conn)
+
+     progress(0.1, desc="Processing video...")
+
+     try:
+         # Create fresh session in database
+         cursor = conn.cursor()
+         cursor.execute(
+             "INSERT INTO video_sessions (session_id) VALUES (?)",
+             (current_session_id,)
+         )
+         conn.commit()
+
+         progress(0.3, desc="Analyzing video content...")
+
+         # Set global FPS setting for frame extraction
+         global current_fps_setting
+         current_fps_setting = fps_setting
+
+         # Process the video (using the FPS setting)
+         process_video_with_fps(video_file, current_session_id, models, conn, fps_setting)
+
+         progress(0.8, desc="Retrieving results...")
+
+         # Get results
+         captions_text, transcription_text = get_analysis_results()
+
+         progress(1.0, desc="Complete!")
+
+         return "✅ Video processed successfully!", captions_text, transcription_text
+
+     except Exception as e:
+         return f"❌ Error processing video: {e}", "", ""
+
+ def capture_webcam_video(duration, fps, progress=gr.Progress()):
+     """Capture video from webcam"""
+     global models, conn, current_session_id
+
+     if models is None or conn is None:
+         return "❌ System not initialized. Please wait for initialization to complete.", None, gr.Button(visible=False)
+
+     progress(0.1, desc="Initializing webcam...")
+
+     try:
+         cap = cv2.VideoCapture(0)
+         if not cap.isOpened():
+             return "❌ Could not open webcam. Please check your camera connection.", None, gr.Button(visible=False)
+
+         # Set camera properties
+         cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
+         cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
+         actual_fps = cap.get(cv2.CAP_PROP_FPS)
+         print(f"Camera FPS: {actual_fps}, Requested: {fps}")
+
+         # Create a uniquely named temporary video file
+         timestamp = int(time.time())
+         video_path = f"temp_webcam_{timestamp}.mp4"
+
+         # Set up the video writer
+         height, width = 480, 640
+         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+         out = cv2.VideoWriter(video_path, fourcc, float(fps), (width, height))
+
+         if not out.isOpened():
+             return "❌ Could not initialize video writer.", None, gr.Button(visible=False)
+
+         start_time = time.time()
+         frame_count = 0
+
+         progress(0.2, desc=f"Recording for {duration} seconds...")
+
+         while (time.time() - start_time) < duration:
+             ret, frame = cap.read()
+             if not ret:
+                 print("Failed to capture frame")
+                 break
+
+             # Resize frame to ensure consistent size
+             frame = cv2.resize(frame, (width, height))
+             out.write(frame)
+             frame_count += 1
+
+             # Update progress
+             elapsed = time.time() - start_time
+             progress_val = 0.2 + (elapsed / duration) * 0.6
+             progress(min(progress_val, 0.8), desc=f"Recording... {elapsed:.1f}s / {duration}s")
+
+             # Control frame rate more precisely
+             time.sleep(max(0, (1.0 / fps) - 0.01))
+
+         cap.release()
+         out.release()
+
+         progress(0.9, desc="Finalizing video...")
+
+         # Verify the video file was created and has content
+         if not os.path.exists(video_path) or os.path.getsize(video_path) < 1000:
+             return "❌ Video file was not created properly.", None, gr.Button(visible=False)
+
+         if frame_count == 0:
+             try:
+                 os.unlink(video_path)
+             except OSError:
+                 pass
+             return "❌ No frames were captured. Please check your webcam.", None, gr.Button(visible=False)
+
+         progress(1.0, desc="Recording complete!")
+
+         print(f"Video saved: {video_path}, Size: {os.path.getsize(video_path)} bytes, Frames: {frame_count}")
+
+         return (
+             f"✅ Webcam video recorded successfully! ({frame_count} frames, {frame_count/fps:.1f}s)",
+             video_path,
+             gr.Button("🚀 Process Recorded Video", visible=True, variant="secondary")
+         )
+
+     except Exception as e:
+         print(f"Webcam capture error: {e}")
+         return f"❌ Error with webcam capture: {e}", None, gr.Button(visible=False)
+
+ def process_webcam_video(video_path, fps_setting, progress=gr.Progress()):
+     """Process the recorded webcam video (clears the DB first)"""
+     global models, conn, current_session_id
+
+     if not video_path:
+         return "❌ No video to process", "", ""
+
+     if models is None or conn is None:
+         return "❌ System not initialized", "", ""
+
+     progress(0.05, desc="Clearing previous data...")
+
+     # Clear database for the new video
+     clear_database_for_new_video(current_session_id, conn)
+
+     progress(0.1, desc="Processing recorded video...")
+
+     try:
+         # Create fresh session in database
+         cursor = conn.cursor()
+         cursor.execute(
+             "INSERT INTO video_sessions (session_id) VALUES (?)",
+             (current_session_id,)
+         )
+         conn.commit()
+
+         progress(0.3, desc="Analyzing video content...")
+
+         # Set global FPS setting for frame extraction
+         global current_fps_setting
+         current_fps_setting = fps_setting
+
+         # Process the recorded video with the FPS setting
+         process_video_with_fps(video_path, current_session_id, models, conn, fps_setting)
+
+         progress(0.8, desc="Retrieving results...")
+
+         # Get results
+         captions_text, transcription_text = get_analysis_results()
+
+         progress(1.0, desc="Complete!")
+
+         # Clean up temporary file
+         try:
+             os.unlink(video_path)
+         except OSError:
+             pass
+
+         return "✅ Video processed successfully!", captions_text, transcription_text
+
+     except Exception as e:
+         return f"❌ Error processing video: {e}", "", ""
+
+ def get_analysis_results():
+     """Get analysis results for the current session"""
+     global conn, current_session_id
+
+     if conn is None:
+         return "System not initialized.", "System not initialized."
+
+     cursor = conn.cursor()
+
+     # Get captions
+     cursor.execute(
+         "SELECT timestamp, caption FROM captions WHERE session_id = ? ORDER BY timestamp",
+         (current_session_id,)
+     )
+     captions = cursor.fetchall()
+
+     if captions:
+         captions_text = "\n".join([f"**{timestamp:.1f}s:** {caption}" for timestamp, caption in captions])
+     else:
+         captions_text = "No captions found. Please process a video first."
+
+     # Get transcription
+     cursor.execute(
+         "SELECT transcription FROM transcriptions WHERE session_id = ?",
+         (current_session_id,)
+     )
+     transcription_result = cursor.fetchone()
+
+     if transcription_result:
+         transcription_text = transcription_result[0]
+     else:
+         transcription_text = "No transcription found. Please process a video with audio."
+
+     return captions_text, transcription_text
+
+ def refresh_results():
+     """Refresh analysis results"""
+     return get_analysis_results()
+
+ def answer_video_question(question):
+     """Answer a question about the video"""
+     global models, conn, current_session_id
+
+     if not question.strip():
+         return "Please enter a question."
+
+     if models is None or conn is None:
+         return "System not initialized. Please wait for initialization to complete."
+
+     try:
+         context = get_context_for_qa(current_session_id, conn)
+
+         if context.strip() == "CAPTIONS:":
+             return "No video data found. Please process a video first."
+
+         answer = answer_question(question, context, models)
+         return f"**Answer:** {answer}"
+
+     except Exception as e:
+         return f"Error generating answer: {e}"
+
+ def set_example_question(question):
+     """Set an example question in the textbox"""
+     return question
+
+ # Initialize system at startup (no database clearing here)
+ try:
+     init_message = initialize_system()
+     print(init_message)
+ except Exception as e:
+     print(f"Initialization error: {e}")
+     models = None
+     conn = None
+
+ # Define example questions
+ example_questions = [
+     "What objects were visible in the video?",
+     "What was the person doing?",
+     "What did someone say about [topic]?",
+     "What was moving in the scene?",
+     "Describe what happened at the beginning/middle/end"
+ ]
+
+ # Create Gradio interface
+ with gr.Blocks(title="Video Analysis QA System", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🎥 Video Analysis QA System")
+     gr.Markdown("Upload a video or use webcam to analyze content and ask questions!")
+
+     # Store video path for webcam processing
+     webcam_video_path = gr.State(value=None)
+
+     # Main tabs
+     with gr.Tabs():
+         # Video Input Tab
+         with gr.TabItem("📹 Video Input"):
+             input_method = gr.Radio(
+                 choices=["Upload Video", "Use Webcam"],
+                 value="Upload Video",
+                 label="Choose input method"
+             )
+
+             # Upload Video Section
+             with gr.Group(visible=True) as upload_section:
+                 gr.Markdown("### Upload Video")
+                 with gr.Row():
+                     with gr.Column(scale=3):
+                         video_upload = gr.File(
+                             label="Choose a video file",
+                             file_types=[".mp4", ".avi", ".mov", ".mkv"]
+                         )
+                     with gr.Column(scale=1):
+                         upload_fps = gr.Dropdown(
+                             choices=[1, 2, 5, 10, 15, 30],
+                             value=5,
+                             label="Analysis FPS"
+                         )
+
+                 video_preview = gr.Video(label="Video Preview")
+                 upload_btn = gr.Button("🚀 Process Video", variant="primary")
+                 upload_status = gr.Textbox(label="Status", interactive=False)
+
+             # Webcam Section
+             with gr.Group(visible=False) as webcam_section:
+                 gr.Markdown("### 📸 Webcam Capture")
+
+                 with gr.Row():
+                     with gr.Column(scale=2):
+                         webcam_preview = gr.Image(
+                             label="Webcam Preview",
+                             sources=["webcam"],
+                             streaming=True
+                         )
+
+                     with gr.Column(scale=1):
+                         duration_slider = gr.Slider(
+                             minimum=3,
+                             maximum=30,
+                             value=10,
+                             step=1,
+                             label="Recording Duration (seconds)"
+                         )
+
+                         fps_dropdown = gr.Dropdown(
+                             choices=[1, 2, 5, 10, 15],
+                             value=5,
+                             label="Recording FPS"
+                         )
+
+                         webcam_analysis_fps = gr.Dropdown(
+                             choices=[1, 2, 5, 10, 15, 30],
+                             value=5,
+                             label="Analysis FPS"
+                         )
+
+                         webcam_info = gr.Markdown("Will capture approximately 50 frames")
+                         webcam_btn = gr.Button("🔴 Start Recording", variant="primary")
+
+                 # Status and recorded video preview
+                 webcam_status = gr.Textbox(label="Status", interactive=False)
+
+                 with gr.Row():
+                     with gr.Column(scale=3):
+                         recorded_video_preview = gr.Video(label="Recorded Video", visible=True)
+                     with gr.Column(scale=1):
+                         process_webcam_btn = gr.Button("🚀 Process Recorded Video", visible=False, variant="secondary", size="lg")
+
+         # Analysis Results Tab
+         with gr.TabItem("🔍 Analysis Results"):
+             refresh_btn = gr.Button("🔄 Refresh Results", variant="secondary")
+
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("### Frame Captions")
+                     captions_output = gr.Textbox(
+                         label="Captions",
+                         lines=10,
+                         max_lines=20,
+                         interactive=False
+                     )
+
+                 with gr.Column():
+                     gr.Markdown("### Audio Transcription")
+                     transcription_output = gr.Textbox(
+                         label="Transcription",
+                         lines=10,
+                         max_lines=20,
+                         interactive=False
+                     )
+
+         # Ask Questions Tab
+         with gr.TabItem("❓ Ask Questions"):
+             question_input = gr.Textbox(
+                 label="Ask a question about the video",
+                 placeholder="What was moving in the video?",
+                 lines=2
+             )
+             ask_btn = gr.Button("🤔 Get Answer", variant="primary")
+             answer_output = gr.Textbox(
+                 label="Answer",
+                 lines=5,
+                 max_lines=10,
+                 interactive=False
+             )
+
+             gr.Markdown("### 💡 Example Questions")
+             with gr.Row():
+                 for question in example_questions:
+                     example_btn = gr.Button(question, size="sm")
+                     example_btn.click(
+                         fn=set_example_question,
+                         inputs=[gr.State(question)],
+                         outputs=[question_input]
+                     )
+
+     # Event handlers
+     def toggle_input_method(method):
+         return (
+             gr.Group(visible=(method == "Upload Video")),
+             gr.Group(visible=(method == "Use Webcam"))
+         )
+
+     def update_webcam_info(duration, fps):
+         estimated_frames = duration * fps
+         return f"Will capture approximately {estimated_frames} frames"
+
+     def preview_video(file):
+         return file if file else None
+
+     def handle_webcam_capture(duration, fps):
+         """Handle webcam capture and return results"""
+         status, video_path, _ = capture_webcam_video(duration, fps)
+
+         if video_path:
+             return (
+                 status,
+                 video_path,  # Store path in state
+                 video_path,  # Pass path directly to video component
+                 gr.Button("🚀 Process Recorded Video", visible=True, variant="secondary")
+             )
+         else:
+             return (
+                 status,
+                 None,
+                 None,
+                 gr.Button("🚀 Process Recorded Video", visible=False, variant="secondary")
+             )
+
+     # Connect event handlers
+     input_method.change(
+         fn=toggle_input_method,
+         inputs=[input_method],
+         outputs=[upload_section, webcam_section]
+     )
+
+     duration_slider.change(
+         fn=update_webcam_info,
+         inputs=[duration_slider, fps_dropdown],
+         outputs=[webcam_info]
+     )
+
+     fps_dropdown.change(
+         fn=update_webcam_info,
+         inputs=[duration_slider, fps_dropdown],
+         outputs=[webcam_info]
+     )
+
+     video_upload.change(
+         fn=preview_video,
+         inputs=[video_upload],
+         outputs=[video_preview]
+     )
+
+     upload_btn.click(
+         fn=process_uploaded_video,
+         inputs=[video_upload, upload_fps],
+         outputs=[upload_status, captions_output, transcription_output]
+     )
+
+     webcam_btn.click(
+         fn=handle_webcam_capture,
+         inputs=[duration_slider, fps_dropdown],
+         outputs=[webcam_status, webcam_video_path, recorded_video_preview, process_webcam_btn]
+     )
+
+     process_webcam_btn.click(
+         fn=process_webcam_video,
+         inputs=[webcam_video_path, webcam_analysis_fps],
+         outputs=[webcam_status, captions_output, transcription_output]
+     )
+
+     refresh_btn.click(
+         fn=refresh_results,
+         outputs=[captions_output, transcription_output]
+     )
+
+     ask_btn.click(
+         fn=answer_video_question,
+         inputs=[question_input],
+         outputs=[answer_output]
+     )
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",  # Required for Hugging Face Spaces
+         server_port=7860,       # Standard port for Hugging Face Spaces
+         share=False,
+         show_error=True
+     )
models.py ADDED
@@ -0,0 +1,78 @@
+ import sqlite3
+ import torch
+ from transformers import (
+     AutoProcessor, AutoModelForVision2Seq,
+     WhisperProcessor, WhisperForConditionalGeneration,
+     AutoTokenizer, AutoModelForCausalLM
+ )
+
+ def init_database():
+     """Initialize SQLite database"""
+     conn = sqlite3.connect('video_analysis.db', check_same_thread=False)
+     cursor = conn.cursor()
+
+     # Create tables
+     cursor.execute('''
+         CREATE TABLE IF NOT EXISTS video_sessions (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             session_id TEXT UNIQUE,
+             created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+         )
+     ''')
+
+     cursor.execute('''
+         CREATE TABLE IF NOT EXISTS captions (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             session_id TEXT,
+             timestamp REAL,
+             caption TEXT,
+             FOREIGN KEY (session_id) REFERENCES video_sessions (session_id)
+         )
+     ''')
+
+     cursor.execute('''
+         CREATE TABLE IF NOT EXISTS transcriptions (
+             id INTEGER PRIMARY KEY AUTOINCREMENT,
+             session_id TEXT,
+             transcription TEXT,
+             FOREIGN KEY (session_id) REFERENCES video_sessions (session_id)
+         )
+     ''')
+
+     conn.commit()
+     return conn
+
+ def load_models():
+     """Load all AI models"""
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     try:
+         # Load captioning model
+         print("Loading captioning model...")
+         caption_model_id = "quadranttechnologies/qhub-blip-image-captioning-finetuned"
+         caption_processor = AutoProcessor.from_pretrained(caption_model_id)
+         caption_model = AutoModelForVision2Seq.from_pretrained(caption_model_id).to(device)
+
+         # Load transcription model
+         print("Loading transcription model...")
+         whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
+         whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium").to(device)
+         whisper_model.config.forced_decoder_ids = None
+
+         # Load QA model (device_map="auto" requires the accelerate package)
+         print("Loading QA model...")
+         qa_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")
+         qa_model = AutoModelForCausalLM.from_pretrained(
+             "Qwen/Qwen3-1.7B", torch_dtype="auto", device_map="auto"
+         )
+
+         return {
+             'caption_processor': caption_processor,
+             'caption_model': caption_model,
+             'whisper_processor': whisper_processor,
+             'whisper_model': whisper_model,
+             'qa_tokenizer': qa_tokenizer,
+             'qa_model': qa_model,
+             'device': device
+         }
+     except Exception as e:
+         print(f"Error loading models: {e}")
+         return None
packages.txt ADDED
@@ -0,0 +1,2 @@
+ libgl1-mesa-glx
+ libglib2.0-0
processing.py ADDED
@@ -0,0 +1,127 @@
+ import time
+
+ def process_video(video_path, session_id, models, conn):
+     """Original process_video function - maintains compatibility"""
+     try:
+         # Import the pipeline modules
+         from captions import extract_frames, generate_caption
+         from audio import extract_audio, transcribe_audio
+
+         # Extract frames with default interval
+         print("Extracting frames...")
+         frames, timestamps = extract_frames(video_path, interval=0.5)
+
+         if not frames:
+             print("No frames could be extracted from the video.")
+             return
+
+         # Generate captions
+         print(f"Generating captions for {len(frames)} frames...")
+         cursor = conn.cursor()
+
+         for i, (frame, timestamp) in enumerate(zip(frames, timestamps)):
+             caption = generate_caption(frame, models)
+             cursor.execute(
+                 "INSERT INTO captions (session_id, timestamp, caption) VALUES (?, ?, ?)",
+                 (session_id, timestamp, caption)
+             )
+
+             # Update status every 10 frames
+             if i % 10 == 0:
+                 print(f"Generating captions... {i+1}/{len(frames)}")
+
+         conn.commit()
+
+         # Extract and transcribe audio
+         print("Extracting and transcribing audio...")
+         audio, sr = extract_audio(video_path)
+
+         if audio is not None and len(audio) > 0:
+             transcription = transcribe_audio(audio, sr, models)
+             cursor.execute(
+                 "INSERT INTO transcriptions (session_id, transcription) VALUES (?, ?)",
+                 (session_id, transcription)
+             )
+             conn.commit()
+         else:
+             print("No audio found in the video or audio extraction failed.")
+
+         print("Processing complete!")
+
+     except Exception as e:
+         print(f"Error processing video: {e}")
+
+ def process_video_with_fps(video_path, session_id, models, conn, fps=5):
+     """Enhanced process_video function with FPS control"""
+     try:
+         # Import the pipeline modules
+         from captions import extract_frames_with_fps, generate_caption, batch_generate_captions
+         from audio import extract_audio, transcribe_audio
+
+         # Calculate interval from FPS
+         interval = 1.0 / fps
+
+         # Extract frames with custom FPS
+         print(f"Extracting frames at {fps} FPS (interval: {interval:.2f}s)...")
+         frames, timestamps = extract_frames_with_fps(video_path, interval=interval)
+
+         if not frames:
+             print("No frames could be extracted from the video.")
+             return
+
+         # Generate captions (use batch processing for efficiency)
+         print(f"Generating captions for {len(frames)} frames...")
+         cursor = conn.cursor()
+
+         # Option 1: batch processing (more efficient)
+         try:
+             captions = batch_generate_captions(frames, models, batch_size=4)
+
+             # Insert all captions
+             for i, (timestamp, caption) in enumerate(zip(timestamps, captions)):
+                 cursor.execute(
+                     "INSERT INTO captions (session_id, timestamp, caption) VALUES (?, ?, ?)",
+                     (session_id, timestamp, caption)
+                 )
+
+                 if i % 10 == 0:
+                     print(f"Inserting captions... {i+1}/{len(captions)}")
+
+         except Exception as e:
+             # Option 2: fall back to individual processing
+             print(f"Batch processing failed ({e}), using individual processing...")
+             for i, (frame, timestamp) in enumerate(zip(frames, timestamps)):
+                 caption = generate_caption(frame, models)
+                 cursor.execute(
+                     "INSERT INTO captions (session_id, timestamp, caption) VALUES (?, ?, ?)",
+                     (session_id, timestamp, caption)
+                 )
+
+                 if i % 10 == 0:
+                     print(f"Generating captions... {i+1}/{len(frames)}")
+
+         conn.commit()
+
+         # Extract and transcribe audio
+         print("Extracting and transcribing audio...")
+         audio, sr = extract_audio(video_path)
+
+         if audio is not None and len(audio) > 0:
+             transcription = transcribe_audio(audio, sr, models)
+             cursor.execute(
+                 "INSERT INTO transcriptions (session_id, transcription) VALUES (?, ?)",
+                 (session_id, transcription)
+             )
+             conn.commit()
+         else:
+             print("No audio found in the video or audio extraction failed.")
+
+         print("Processing complete!")
+
+     except Exception as e:
+         print(f"Error processing video with FPS: {e}")
+         # Fall back to the original function
+         print("Falling back to original processing...")
+         process_video(video_path, session_id, models, conn)