milwright committed
Commit d1ae858
0 Parent(s):

Initial commit with clean slate

Files changed (7)
  1. .env.template +10 -0
  2. .gitignore +49 -0
  3. README.md +73 -0
  4. advanced_scraper_ui.py +479 -0
  5. enhanced_scraper.py +219 -0
  6. requirements.txt +5 -0
  7. run_scraper.sh +36 -0
.env.template ADDED
@@ -0,0 +1,10 @@
+ # Reddit API Credentials
+ REDDIT_CLIENT_ID=your_client_id_here
+ REDDIT_CLIENT_SECRET=your_client_secret_here
+ REDDIT_USER_AGENT=your_user_agent_here
+ REDDIT_USERNAME=your_username_here
+ REDDIT_PASSWORD=your_password_here
+
+ # Optional Configuration
+ MAX_POSTS_PER_SUBREDDIT=100
+ CLUSTERING_THRESHOLD=0.3
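
Neither committed script reads this template automatically; the scraper classes take credentials as constructor arguments. A minimal sketch of wiring the two together, assuming the values above are copied into a local `.env` and loaded into the process environment (python-dotenv is a hypothetical extra dependency, not listed in requirements.txt):

```python
import os

from dotenv import load_dotenv  # hypothetical helper; not in requirements.txt
from enhanced_scraper import EnhancedRedditScraper

load_dotenv()  # pulls the REDDIT_* values from a local .env into os.environ

scraper = EnhancedRedditScraper(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=os.environ["REDDIT_USER_AGENT"],
)

# Optional settings arrive as strings, so cast before use.
max_posts = int(os.environ.get("MAX_POSTS_PER_SUBREDDIT", "100"))
```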
.gitignore ADDED
@@ -0,0 +1,49 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environment
+ venv/
+ env/
+ ENV/
+
+ # Environment Variables
+ .env
+ .env.local
+ .env.*.local
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Logs
+ *.log
+ logs/
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Data directories
+ csv/
+ results/
README.md ADDED
@@ -0,0 +1,73 @@
+ # Reddit Web Scraper UI
+
+ A user interface for running Reddit web scraping operations in a local development environment.
+
+ ## Features
+
+ - Simple and advanced UI options
+ - Search multiple subreddits simultaneously
+ - Filter posts by keywords and various criteria
+ - Visualize data with interactive charts
+ - Export results to CSV or JSON
+ - Track search history
+
+ ## Installation
+
+ 1. Clone this repository
+ 2. Make sure you have Python 3.7+ installed
+ 3. Install dependencies:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ## Usage
+
+ ### Quick Start
+
+ Run the script to launch the UI:
+
+ ```bash
+ ./run_scraper.sh
+ ```
+
+ For the basic UI mode:
+
+ ```bash
+ ./run_scraper.sh basic
+ ```
+
+ ### Manual Launch
+
+ Alternatively, you can run either UI directly:
+
+ ```bash
+ # Basic UI
+ streamlit run scraper_ui.py
+
+ # Advanced UI
+ streamlit run advanced_scraper_ui.py
+ ```
+
+ ## Requirements
+
+ - Python 3.7+
+ - Reddit API credentials (provided by default for testing)
+ - Dependencies listed in requirements.txt
+
+ ## Development
+
+ This project includes:
+
+ - `google_adk.py` - Core file with Reddit scraper functionality
+ - `enhanced_scraper.py` - Extended scraper with advanced features
+ - `scraper_ui.py` - Basic Streamlit UI
+ - `advanced_scraper_ui.py` - Advanced UI with visualizations and filtering
+
+ ## License
+
+ This project is for educational purposes only.
+
+ ## Note
+
+ The included Reddit API credentials are for demonstration purposes only. For production use, please obtain your own credentials from the [Reddit Developer Portal](https://www.reddit.com/prefs/apps).
advanced_scraper_ui.py ADDED
@@ -0,0 +1,479 @@
+ import streamlit as st
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import plotly.express as px
+ import time
+ import os
+ import json
+ from datetime import datetime
+ from enhanced_scraper import EnhancedRedditScraper
+
+ # Page configuration
+ st.set_page_config(
+     page_title="Advanced Reddit Scraper",
+     page_icon="📊",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ # Add custom CSS
+ st.markdown("""
+ <style>
+     .main-header {
+         font-size: 2.5rem;
+         margin-bottom: 1rem;
+     }
+     .subheader {
+         font-size: 1.5rem;
+         color: #ff4500;
+         margin-bottom: 1rem;
+     }
+     .card {
+         padding: 1rem;
+         border-radius: 0.5rem;
+         margin-bottom: 1rem;
+         border: 1px solid #ddd;
+     }
+     .small-text {
+         font-size: 0.8rem;
+         color: #777;
+     }
+     .stButton button {
+         width: 100%;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Session state initialization
+ if 'results' not in st.session_state:
+     st.session_state.results = None
+ if 'scraper' not in st.session_state:
+     st.session_state.scraper = None
+ if 'search_history' not in st.session_state:
+     st.session_state.search_history = []
+ if 'filters' not in st.session_state:
+     st.session_state.filters = {
+         'min_score': 0,
+         'date_from': None,
+         'date_to': None,
+         'show_only_with_comments': False
+     }
+
+ # Functions
+ def initialize_scraper(client_id, client_secret, user_agent):
+     """Initialize the scraper with API credentials"""
+     try:
+         scraper = EnhancedRedditScraper(
+             client_id=client_id,
+             client_secret=client_secret,
+             user_agent=user_agent
+         )
+         st.session_state.scraper = scraper
+         return True
+     except Exception as e:
+         st.error(f"Failed to initialize scraper: {str(e)}")
+         return False
+
+ def run_search(subreddits, keywords, limit, sort_by, include_comments,
+                include_selftext, min_score):
+     """Run the search with provided parameters"""
+     if not st.session_state.scraper:
+         st.error("Scraper not initialized. Please set up API credentials first.")
+         return False
+
+     try:
+         with st.spinner("Scraping Reddit..."):
+             if len(subreddits) == 1:
+                 # Single subreddit search
+                 results = st.session_state.scraper.scrape_subreddit(
+                     subreddit_name=subreddits[0],
+                     keywords=keywords,
+                     limit=limit,
+                     sort_by=sort_by,
+                     include_comments=include_comments,
+                     include_selftext=include_selftext,
+                     min_score=min_score
+                 )
+                 st.session_state.results = {subreddits[0]: results}
+             else:
+                 # Multiple subreddit search
+                 results = st.session_state.scraper.search_multiple_subreddits(
+                     subreddits=subreddits,
+                     keywords=keywords,
+                     limit=limit,
+                     sort_by=sort_by,
+                     include_comments=include_comments,
+                     include_selftext=include_selftext,
+                     min_score=min_score
+                 )
+                 st.session_state.results = results
+
+             # Add to search history
+             search_info = {
+                 'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                 'subreddits': subreddits,
+                 'keywords': keywords,
+                 'total_results': sum(len(results) for results in st.session_state.results.values())
+             }
+             st.session_state.search_history.append(search_info)
+
+         return True
+     except Exception as e:
+         st.error(f"Search failed: {str(e)}")
+         return False
+
+ def filter_results(results, filters):
+     """Apply filters to results"""
+     filtered = {}
+
+     for subreddit, posts in results.items():
+         filtered_posts = []
+
+         for post in posts:
+             # Apply score filter
+             if post['score'] < filters['min_score']:
+                 continue
+
+             # Apply date filters if set
+             if filters['date_from'] or filters['date_to']:
+                 post_date = datetime.strptime(post['created_utc'], '%Y-%m-%d %H:%M:%S')
+
+                 if filters['date_from'] and post_date < filters['date_from']:
+                     continue
+                 if filters['date_to'] and post_date > filters['date_to']:
+                     continue
+
+             # Filter for posts with comments if requested
+             if filters['show_only_with_comments'] and (
+                     'matching_comments' not in post or not post['matching_comments']):
+                 continue
+
+             filtered_posts.append(post)
+
+         filtered[subreddit] = filtered_posts
+
+     return filtered
+
+ def create_data_visualization(results):
+     """Create data visualizations based on results"""
+     # Combine all results
+     all_posts = []
+     for subreddit, posts in results.items():
+         for post in posts:
+             post['subreddit'] = subreddit
+             all_posts.append(post)
+
+     if not all_posts:
+         st.warning("No data to visualize.")
+         return
+
+     df = pd.DataFrame(all_posts)
+
+     # Create tabs for different visualizations
+     viz_tab1, viz_tab2, viz_tab3 = st.tabs(["Score Distribution", "Posts by Subreddit", "Time Analysis"])
+
+     with viz_tab1:
+         st.subheader("Score Distribution")
+         fig = px.histogram(df, x="score", color="subreddit", nbins=20,
+                            title="Distribution of Post Scores")
+         st.plotly_chart(fig, use_container_width=True)
+
+     with viz_tab2:
+         st.subheader("Posts by Subreddit")
+         subreddit_counts = df['subreddit'].value_counts().reset_index()
+         subreddit_counts.columns = ['subreddit', 'count']
+         fig = px.bar(subreddit_counts, x='subreddit', y='count',
+                      title="Number of Matching Posts by Subreddit")
+         st.plotly_chart(fig, use_container_width=True)
+
+     with viz_tab3:
+         st.subheader("Time Analysis")
+         # Convert created_utc to datetime if it's not already
+         if 'created_utc' in df.columns:
+             df['created_date'] = pd.to_datetime(df['created_utc'])
+             df['hour_of_day'] = df['created_date'].dt.hour
+
+             fig = px.histogram(df, x="hour_of_day", nbins=24,
+                                title="Posts by Hour of Day")
+             fig.update_layout(xaxis_title="Hour of Day (UTC)")
+             st.plotly_chart(fig, use_container_width=True)
+
+ def main():
+     # Header
+     st.markdown('<div class="main-header">Advanced Reddit Scraper</div>', unsafe_allow_html=True)
+     st.markdown('<div class="subheader">Web Scraping Development Environment</div>', unsafe_allow_html=True)
+
+     # Sidebar for configuration
+     with st.sidebar:
+         st.header("Configuration")
+
+         # Credentials
+         with st.expander("Reddit API Credentials", expanded=not st.session_state.scraper):
+             client_id = st.text_input("Client ID", value="aBHOo9oQ3D-liyfGOc34cQ")
+             client_secret = st.text_input("Client Secret", value="4__ziHwdOBNYjlGUG0k7XvK-r5OJDw", type="password")
+             user_agent = st.text_input("User Agent", value="WebScraperUI/1.0")
+
+             if st.button("Initialize API Connection"):
+                 if initialize_scraper(client_id, client_secret, user_agent):
+                     st.success("API connection established!")
+
+         # Search Parameters
+         st.subheader("Search Parameters")
+
+         # Multiple subreddit input
+         subreddits_input = st.text_area("Subreddits (one per line)", value="cuny\ncollegequestions")
+         subreddits = [s.strip() for s in subreddits_input.split("\n") if s.strip()]
+
+         # Keywords input
+         keywords_input = st.text_area("Keywords (one per line)", value="question\nhelp\nconfused")
+         keywords = [k.strip() for k in keywords_input.split("\n") if k.strip()]
+
+         # Other parameters
+         limit = st.slider("Number of posts to scan per subreddit", 10, 200, 50)
+         sort_by = st.selectbox("Sort posts by", ["hot", "new", "top", "rising"], index=0)
+         include_selftext = st.checkbox("Include post content in search", value=True)
+         include_comments = st.checkbox("Include comments in search", value=True)
+         min_score = st.slider("Minimum score (upvotes)", 0, 1000, 0)
+
+         # Action buttons
+         search_col, clear_col = st.columns(2)
+         with search_col:
+             search_button = st.button("Run Search", type="primary", use_container_width=True)
+         with clear_col:
+             clear_button = st.button("Clear Results", type="secondary", use_container_width=True)
+
+     # Main interface tabs
+     tab1, tab2, tab3, tab4 = st.tabs(["Results", "Visualizations", "Export", "History"])
+
+     # Handle Actions
+     if clear_button:
+         st.session_state.results = None
+         st.rerun()
+
+     if search_button:
+         if not subreddits:
+             st.error("Please enter at least one subreddit to search.")
+         elif not keywords:
+             st.error("Please enter at least one keyword to search.")
+         else:
+             success = run_search(
+                 subreddits=subreddits,
+                 keywords=keywords,
+                 limit=limit,
+                 sort_by=sort_by,
+                 include_comments=include_comments,
+                 include_selftext=include_selftext,
+                 min_score=min_score
+             )
+             if success:
+                 st.success(f"Search completed! Found results in {len(st.session_state.results)} subreddits.")
+
+     # Tab 1: Results
+     with tab1:
+         if st.session_state.results:
+             # Post-search filters
+             st.markdown('<div class="card">', unsafe_allow_html=True)
+             st.subheader("Filter Results")
+             filter_col1, filter_col2, filter_col3 = st.columns(3)
+
+             with filter_col1:
+                 st.session_state.filters['min_score'] = st.number_input(
+                     "Minimum score", min_value=0, value=st.session_state.filters['min_score'])
+
+             with filter_col2:
+                 st.session_state.filters['date_from'] = st.date_input(
+                     "From date", value=None)
+
+             with filter_col3:
+                 st.session_state.filters['date_to'] = st.date_input(
+                     "To date", value=None)
+
+             st.session_state.filters['show_only_with_comments'] = st.checkbox(
+                 "Show only posts with matching comments",
+                 value=st.session_state.filters['show_only_with_comments'])
+
+             apply_filters = st.button("Apply Filters")
+             st.markdown('</div>', unsafe_allow_html=True)
+
+             # Apply filters if requested
+             if apply_filters:
+                 filtered_results = filter_results(st.session_state.results, st.session_state.filters)
+             else:
+                 filtered_results = st.session_state.results
+
+             # Show results for each subreddit
+             total_posts = sum(len(posts) for posts in filtered_results.values())
+             st.subheader(f"Search Results ({total_posts} posts found)")
+
+             for subreddit, posts in filtered_results.items():
+                 with st.expander(f"r/{subreddit} - {len(posts)} posts", expanded=len(filtered_results) == 1):
+                     if posts:
+                         # Create a dataframe for easier viewing
+                         df = pd.DataFrame([{
+                             'Title': p['title'],
+                             'Score': p['score'],
+                             'Comments': p['num_comments'],
+                             'Date': p['created_utc'],
+                             'URL': p['permalink']
+                         } for p in posts])
+
+                         st.dataframe(df, use_container_width=True)
+
+                         # Show detailed post view
+                         st.subheader("Post Details")
+                         post_index = st.slider(f"Select post from r/{subreddit}",
+                                                0, max(0, len(posts)-1), 0)
+
+                         if len(posts) > 0:
+                             post = posts[post_index]
+
+                             # Display post details in a card
+                             st.markdown('<div class="card">', unsafe_allow_html=True)
+                             st.markdown(f"### {post['title']}")
+                             st.markdown(f"**Author:** u/{post['author']} | **Score:** {post['score']} | **Comments:** {post['num_comments']}")
+                             st.markdown(f"**Posted on:** {post['created_utc']}")
+                             st.markdown(f"**URL:** [{post['url']}]({post['url']})")
+
+                             if post['text']:
+                                 st.markdown("##### Post Content")
+                                 with st.container():
+                                     show_content = st.checkbox("Show full content", key=f"content_{subreddit}_{post_index}")
+                                     if show_content:
+                                         st.text(post['text'])
+
+                             # Show matching comments if available
+                             if 'matching_comments' in post and post['matching_comments']:
+                                 st.markdown(f"##### Matching Comments ({len(post['matching_comments'])})")
+                                 with st.container():
+                                     show_comments = st.checkbox("Show comments", value=True, key=f"comments_{subreddit}_{post_index}")
+                                     if show_comments:
+                                         for i, comment in enumerate(post['matching_comments']):
+                                             st.markdown(f"**u/{comment['author']}** ({comment['score']} points) - {comment['created_utc']}")
+                                             st.text(comment['body'])
+                                             if i < len(post['matching_comments']) - 1:
+                                                 st.divider()
+
+                             st.markdown('</div>', unsafe_allow_html=True)
+                     else:
+                         st.info(f"No posts found in r/{subreddit} matching the current filters.")
+         else:
+             st.info("Configure the search parameters and click 'Run Search' to begin.")
+
+             # Show help for first-time users
+             with st.expander("Help & Tips"):
+                 st.markdown("""
+                 ### Getting Started with Reddit Scraper
+
+                 1. **Set up API credentials** in the sidebar (already pre-filled with sample credentials)
+                 2. **Enter subreddits** you want to search (one per line)
+                 3. **Enter keywords** to filter posts (one per line)
+                 4. Adjust other settings as needed
+                 5. Click **Run Search** to start
+
+                 ### Tips for Effective Searches
+
+                 - Use specific keywords to narrow down results
+                 - Try searching multiple related subreddits for better coverage
+                 - Include comments in search to find discussions where your keywords appear in replies
+                 - Use the visualization tab to analyze trends in the results
+                 - Export your results for further analysis in other tools
+                 """)
+
+     # Tab 2: Visualizations
+     with tab2:
+         if st.session_state.results:
+             # Apply current filters to visualization data
+             filtered_results = filter_results(st.session_state.results, st.session_state.filters)
+             create_data_visualization(filtered_results)
+         else:
+             st.info("Run a search to generate visualizations.")
+
+     # Tab 3: Export
+     with tab3:
+         if st.session_state.results:
+             st.subheader("Export Results")
+
+             # Apply current filters
+             filtered_results = filter_results(st.session_state.results, st.session_state.filters)
+
+             # Format selection
+             export_format = st.radio("Export format", ["CSV", "JSON"], horizontal=True)
+
+             # Filename input
+             timestamp = time.strftime("%Y%m%d_%H%M%S")
+             default_filename = f"reddit_scrape_{timestamp}"
+             filename = st.text_input("Filename (without extension)", value=default_filename)
+
+             # Export button
+             export_clicked = st.button("Export Data", type="primary")
+
+             if export_clicked:
+                 try:
+                     # Combine all results into a flat list for export
+                     all_results = []
+                     for subreddit, posts in filtered_results.items():
+                         for post in posts:
+                             post_copy = post.copy()
+                             post_copy['subreddit'] = subreddit
+                             all_results.append(post_copy)
+
+                     # Save results based on selected format
+                     if export_format == "CSV":
+                         # Convert to dataframe and save
+                         df = pd.DataFrame(all_results)
+
+                         # Handle nested structures for CSV
+                         if 'matching_comments' in df.columns:
+                             df['matching_comments'] = df['matching_comments'].apply(
+                                 lambda x: json.dumps(x) if isinstance(x, list) else ''
+                             )
+
+                         csv_file = f"{filename}.csv"
+                         df.to_csv(csv_file, index=False)
+
+                         # Create download button
+                         with open(csv_file, 'rb') as f:
+                             st.download_button(
+                                 label="Download CSV",
+                                 data=f,
+                                 file_name=csv_file,
+                                 mime="text/csv"
+                             )
+                         st.success(f"Exported {len(all_results)} posts to {csv_file}")
+
+                     else:  # JSON
+                         json_file = f"{filename}.json"
+                         with open(json_file, 'w') as f:
+                             json.dump(all_results, f, indent=2)
+
+                         # Create download button
+                         with open(json_file, 'rb') as f:
+                             st.download_button(
+                                 label="Download JSON",
+                                 data=f,
+                                 file_name=json_file,
+                                 mime="application/json"
+                             )
+                         st.success(f"Exported {len(all_results)} posts to {json_file}")
+
+                 except Exception as e:
+                     st.error(f"Export failed: {str(e)}")
+         else:
+             st.info("Run a search to export results.")
+
+     # Tab 4: History
+     with tab4:
+         st.subheader("Search History")
+
+         if st.session_state.search_history:
+             for i, search in enumerate(reversed(st.session_state.search_history)):
+                 with st.expander(f"Search #{len(st.session_state.search_history)-i}: {search['timestamp']} ({search['total_results']} results)"):
+                     st.markdown(f"**Subreddits:** {', '.join(search['subreddits'])}")
+                     st.markdown(f"**Keywords:** {', '.join(search['keywords'])}")
+                     st.markdown(f"**Results:** {search['total_results']} posts")
+                     st.markdown(f"**Time:** {search['timestamp']}")
+         else:
+             st.info("No search history yet.")
+
+ if __name__ == "__main__":
+     main()
enhanced_scraper.py ADDED
@@ -0,0 +1,219 @@
+ import praw
+ import pandas as pd
+ import datetime
+ import re
+ import json
+ import os
+ from typing import List, Dict, Any, Optional
+
+ class EnhancedRedditScraper:
+     """
+     An enhanced Reddit scraper that provides more advanced functionality
+     than the basic RedditScraperAgent.
+     """
+
+     def __init__(self, client_id: str, client_secret: str, user_agent: str):
+         """
+         Initialize the Reddit scraper with API credentials.
+
+         Args:
+             client_id: Reddit API client ID
+             client_secret: Reddit API client secret
+             user_agent: User agent string for Reddit API
+         """
+         self.reddit = praw.Reddit(
+             client_id=client_id,
+             client_secret=client_secret,
+             user_agent=user_agent
+         )
+         self.last_search_results = []
+
+     def scrape_subreddit(self,
+                          subreddit_name: str,
+                          keywords: List[str],
+                          limit: int = 100,
+                          sort_by: str = "hot",
+                          include_comments: bool = False,
+                          min_score: int = 0,
+                          include_selftext: bool = True) -> List[Dict[str, Any]]:
+         """
+         Scrape a subreddit for posts containing specified keywords.
+
+         Args:
+             subreddit_name: Name of the subreddit to scrape
+             keywords: List of keywords to search for
+             limit: Maximum number of posts to retrieve
+             sort_by: How to sort posts ('hot', 'new', 'top', 'rising')
+             include_comments: Whether to search post comments
+             min_score: Minimum score (upvotes) for posts
+             include_selftext: Whether to search post content (selftext)
+
+         Returns:
+             List of matching post dictionaries
+         """
+         subreddit = self.reddit.subreddit(subreddit_name)
+         results = []
+
+         # Choose the right sort method
+         if sort_by == "hot":
+             submissions = subreddit.hot(limit=limit)
+         elif sort_by == "new":
+             submissions = subreddit.new(limit=limit)
+         elif sort_by == "top":
+             submissions = subreddit.top(limit=limit)
+         elif sort_by == "rising":
+             submissions = subreddit.rising(limit=limit)
+         else:
+             submissions = subreddit.hot(limit=limit)
+
+         # Process each submission
+         for submission in submissions:
+             # Check if post meets the minimum score requirement
+             if submission.score < min_score:
+                 continue
+
+             # Check for keywords in title or selftext
+             title_match = any(keyword.lower() in submission.title.lower() for keyword in keywords)
+             selftext_match = False
+
+             if include_selftext:
+                 selftext_match = any(keyword.lower() in submission.selftext.lower() for keyword in keywords)
+
+             comment_match = False
+             comments_data = []
+
+             # Search comments if enabled
+             if include_comments:
+                 submission.comments.replace_more(limit=3)  # Load some MoreComments
+                 for comment in submission.comments.list()[:20]:  # Limit to first 20 comments
+                     if any(keyword.lower() in comment.body.lower() for keyword in keywords):
+                         comment_match = True
+                         comments_data.append({
+                             'author': str(comment.author),
+                             'body': comment.body,
+                             'score': comment.score,
+                             'created_utc': datetime.datetime.fromtimestamp(comment.created_utc).strftime('%Y-%m-%d %H:%M:%S')
+                         })
+
+             # Add post to results if it matches criteria
+             if title_match or selftext_match or comment_match:
+                 created_time = datetime.datetime.fromtimestamp(submission.created_utc)
+
+                 post_data = {
+                     'title': submission.title,
+                     'text': submission.selftext,
+                     'url': submission.url,
+                     'score': submission.score,
+                     'id': submission.id,
+                     'author': str(submission.author),
+                     'created_utc': created_time.strftime('%Y-%m-%d %H:%M:%S'),
+                     'upvote_ratio': submission.upvote_ratio,
+                     'num_comments': submission.num_comments,
+                     'permalink': f"https://www.reddit.com{submission.permalink}",
+                 }
+
+                 if include_comments and comments_data:
+                     post_data['matching_comments'] = comments_data
+
+                 results.append(post_data)
+
+         # Store last search results
+         self.last_search_results = results
+         return results
+
+     def search_multiple_subreddits(self,
+                                    subreddits: List[str],
+                                    keywords: List[str],
+                                    **kwargs) -> Dict[str, List[Dict[str, Any]]]:
+         """
+         Search multiple subreddits for the same keywords.
+
+         Args:
+             subreddits: List of subreddit names to search
+             keywords: List of keywords to search for
+             **kwargs: Additional arguments to pass to scrape_subreddit
+
+         Returns:
+             Dictionary mapping subreddit names to their results
+         """
+         results = {}
+         for subreddit in subreddits:
+             results[subreddit] = self.scrape_subreddit(subreddit, keywords, **kwargs)
+         return results
+
+     def save_results_to_csv(self, filename: str) -> str:
+         """
+         Save the last search results to a CSV file.
+
+         Args:
+             filename: Name of the file to save (without extension)
+
+         Returns:
+             Path to the saved file
+         """
+         if not self.last_search_results:
+             raise ValueError("No search results to save. Run a search first.")
+
+         df = pd.DataFrame(self.last_search_results)
+
+         # Clean up comment data for CSV format
+         if 'matching_comments' in df.columns:
+             df['matching_comments'] = df['matching_comments'].apply(
+                 lambda x: json.dumps(x) if isinstance(x, list) else ''
+             )
+
+         # Add timestamp to filename
+         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+         full_filename = f"{filename}_{timestamp}.csv"
+
+         df.to_csv(full_filename, index=False)
+         return os.path.abspath(full_filename)
+
+     def save_results_to_json(self, filename: str) -> str:
+         """
+         Save the last search results to a JSON file.
+
+         Args:
+             filename: Name of the file to save (without extension)
+
+         Returns:
+             Path to the saved file
+         """
+         if not self.last_search_results:
+             raise ValueError("No search results to save. Run a search first.")
+
+         # Add timestamp to filename
+         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+         full_filename = f"{filename}_{timestamp}.json"
+
+         with open(full_filename, 'w', encoding='utf-8') as f:
+             json.dump(self.last_search_results, f, ensure_ascii=False, indent=2)
+
+         return os.path.abspath(full_filename)
+
+
+ # Example usage
+ if __name__ == "__main__":
+     # Create the scraper instance
+     scraper = EnhancedRedditScraper(
+         client_id="aBHOo9oQ3D-liyfGOc34cQ",
+         client_secret="4__ziHwdOBNYjlGUG0k7XvK-r5OJDw",
+         user_agent="rcuny"
+     )
+
+     # Simple example
+     results = scraper.scrape_subreddit(
+         subreddit_name="cuny",
+         keywords=["question", "help", "confused"],
+         limit=25,
+         sort_by="hot",
+         include_comments=True
+     )
+
+     print(f"Found {len(results)} matching posts")
+
+     # Save results to file
+     if results:
+         csv_path = scraper.save_results_to_csv("reddit_results")
+         json_path = scraper.save_results_to_json("reddit_results")
+         print(f"Results saved to {csv_path} and {json_path}")
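
The `__main__` block above only exercises `scrape_subreddit`. As a sketch, the multi-subreddit path defined in this file can be driven the same way (the subreddit names, keywords, and placeholder credentials below are illustrative, not part of the commit):

```python
from enhanced_scraper import EnhancedRedditScraper

# Placeholder credentials; substitute your own (see .env.template).
scraper = EnhancedRedditScraper(
    client_id="your_client_id_here",
    client_secret="your_client_secret_here",
    user_agent="your_user_agent_here",
)

# Extra keyword arguments are forwarded to scrape_subreddit
# (limit, sort_by, include_comments, min_score, include_selftext).
per_subreddit = scraper.search_multiple_subreddits(
    subreddits=["cuny", "collegequestions"],
    keywords=["question", "help"],
    limit=25,
    sort_by="new",
)

for name, posts in per_subreddit.items():
    print(f"r/{name}: {len(posts)} matching posts")

# Note: save_results_to_csv/save_results_to_json persist last_search_results,
# which after this call holds only the final subreddit's matches.
```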
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ praw>=7.7.0
+ pandas>=1.3.0
+ streamlit>=1.3.0
+ plotly>=5.5.0
+ matplotlib>=3.5.0
run_scraper.sh ADDED
@@ -0,0 +1,36 @@
+ #!/bin/bash
+
+ # Script to run the Reddit web scraper UI
+
+ # Check if Python is installed
+ if ! command -v python3 &> /dev/null; then
+     echo "Python 3 is required but could not be found. Please install Python 3."
+     exit 1
+ fi
+
+ # Check if pip is installed
+ if ! command -v pip3 &> /dev/null; then
+     echo "pip3 is required but could not be found. Please install pip."
+     exit 1
+ fi
+
+ # Check for virtual environment
+ if [[ ! -d "venv" ]]; then
+     echo "Creating virtual environment..."
+     python3 -m venv venv
+ fi
+
+ # Activate virtual environment
+ echo "Activating virtual environment..."
+ source venv/bin/activate
+
+ # Install or update dependencies
+ echo "Installing dependencies..."
+ pip install -r requirements.txt
+
+ # Run the Reddit scraper UI
+ echo "Starting Reddit scraper UI..."
+ streamlit run advanced_scraper_ui.py
+
+ # Deactivate virtual environment on exit
+ deactivate