Commit d1ae858
Initial commit with clean slate

Changed files:
- .env.template +10 -0
- .gitignore +49 -0
- README.md +73 -0
- advanced_scraper_ui.py +479 -0
- enhanced_scraper.py +219 -0
- requirements.txt +5 -0
- run_scraper.sh +36 -0
.env.template
ADDED
@@ -0,0 +1,10 @@
# Reddit API Credentials
REDDIT_CLIENT_ID=your_client_id_here
REDDIT_CLIENT_SECRET=your_client_secret_here
REDDIT_USER_AGENT=your_user_agent_here
REDDIT_USERNAME=your_username_here
REDDIT_PASSWORD=your_password_here

# Optional Configuration
MAX_POSTS_PER_SUBREDDIT=100
CLUSTERING_THRESHOLD=0.3
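As a quick illustration (not part of this commit), these variables could be read in Python roughly as follows, assuming they have been exported into the environment or loaded by a tool such as python-dotenv, which is not listed in requirements.txt. The helper name `get_reddit_credentials` is hypothetical:

```python
import os

def get_reddit_credentials() -> dict:
    """Illustrative helper: read the settings defined in .env.template from the
    process environment. Assumes the variables were exported beforehand
    (e.g. `export REDDIT_CLIENT_ID=...`) or loaded by an external tool."""
    return {
        "client_id": os.getenv("REDDIT_CLIENT_ID"),
        "client_secret": os.getenv("REDDIT_CLIENT_SECRET"),
        "user_agent": os.getenv("REDDIT_USER_AGENT", "WebScraperUI/1.0"),
        # Optional tuning values, falling back to the defaults in .env.template
        "max_posts": int(os.getenv("MAX_POSTS_PER_SUBREDDIT", "100")),
        "clustering_threshold": float(os.getenv("CLUSTERING_THRESHOLD", "0.3")),
    }
```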
.gitignore
ADDED
@@ -0,0 +1,49 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
ENV/

# Environment Variables
.env
.env.local
.env.*.local

# IDE
.idea/
.vscode/
*.swp
*.swo

# Logs
*.log
logs/

# OS
.DS_Store
Thumbs.db

# Data directories
csv/
results/
README.md
ADDED
@@ -0,0 +1,73 @@
# Reddit Web Scraper UI

A user interface for running Reddit web scraping operations in a local development environment.

## Features

- Simple and advanced UI options
- Search multiple subreddits simultaneously
- Filter posts by keywords and various criteria
- Visualize data with interactive charts
- Export results to CSV or JSON
- Track search history

## Installation

1. Clone this repository
2. Make sure you have Python 3.7+ installed
3. Install dependencies:

```bash
pip install -r requirements.txt
```

## Usage

### Quick Start

Run the script to launch the UI:

```bash
./run_scraper.sh
```

For the basic UI mode:

```bash
./run_scraper.sh basic
```

### Manual Launch

Alternatively, you can run either UI directly:

```bash
# Basic UI
streamlit run scraper_ui.py

# Advanced UI
streamlit run advanced_scraper_ui.py
```

## Requirements

- Python 3.7+
- Reddit API credentials (provided by default for testing)
- Dependencies listed in requirements.txt

## Development

This project includes:

- `google_adk.py` - Core file with Reddit scraper functionality
- `enhanced_scraper.py` - Extended scraper with advanced features
- `scraper_ui.py` - Basic Streamlit UI
- `advanced_scraper_ui.py` - Advanced UI with visualizations and filtering

## License

This project is for educational purposes only.

## Note

The included Reddit API credentials are for demonstration purposes only. For production use, please obtain your own credentials from the [Reddit Developer Portal](https://www.reddit.com/prefs/apps).
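For orientation before the code below, here is a minimal sketch of using the scraper programmatically, based on the `EnhancedRedditScraper` class added in `enhanced_scraper.py` in this commit. The credential values are placeholders and would come from your own Reddit app:

```python
from enhanced_scraper import EnhancedRedditScraper

# Placeholders: substitute your own Reddit API credentials.
scraper = EnhancedRedditScraper(
    client_id="your_client_id_here",
    client_secret="your_client_secret_here",
    user_agent="WebScraperUI/1.0",
)

# Search a single subreddit for keyword matches (see scrape_subreddit below).
posts = scraper.scrape_subreddit(
    subreddit_name="cuny",
    keywords=["question", "help"],
    limit=25,
    sort_by="new",
    include_comments=True,
)
print(f"Found {len(posts)} matching posts")

# Persist the results; the helper appends a timestamp to the filename.
if posts:
    print(scraper.save_results_to_csv("reddit_results"))
```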
advanced_scraper_ui.py
ADDED
@@ -0,0 +1,479 @@
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import time
import os
import json
from datetime import datetime
from enhanced_scraper import EnhancedRedditScraper

# Page configuration
st.set_page_config(
    page_title="Advanced Reddit Scraper",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Add custom CSS
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        margin-bottom: 1rem;
    }
    .subheader {
        font-size: 1.5rem;
        color: #ff4500;
        margin-bottom: 1rem;
    }
    .card {
        padding: 1rem;
        border-radius: 0.5rem;
        margin-bottom: 1rem;
        border: 1px solid #ddd;
    }
    .small-text {
        font-size: 0.8rem;
        color: #777;
    }
    .stButton button {
        width: 100%;
    }
</style>
""", unsafe_allow_html=True)

# Session state initialization
if 'results' not in st.session_state:
    st.session_state.results = None
if 'scraper' not in st.session_state:
    st.session_state.scraper = None
if 'search_history' not in st.session_state:
    st.session_state.search_history = []
if 'filters' not in st.session_state:
    st.session_state.filters = {
        'min_score': 0,
        'date_from': None,
        'date_to': None,
        'show_only_with_comments': False
    }

# Functions
def initialize_scraper(client_id, client_secret, user_agent):
    """Initialize the scraper with API credentials"""
    try:
        scraper = EnhancedRedditScraper(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        st.session_state.scraper = scraper
        return True
    except Exception as e:
        st.error(f"Failed to initialize scraper: {str(e)}")
        return False

def run_search(subreddits, keywords, limit, sort_by, include_comments,
               include_selftext, min_score):
    """Run the search with provided parameters"""
    if not st.session_state.scraper:
        st.error("Scraper not initialized. Please set up API credentials first.")
        return False

    try:
        with st.spinner("Scraping Reddit..."):
            if len(subreddits) == 1:
                # Single subreddit search
                results = st.session_state.scraper.scrape_subreddit(
                    subreddit_name=subreddits[0],
                    keywords=keywords,
                    limit=limit,
                    sort_by=sort_by,
                    include_comments=include_comments,
                    include_selftext=include_selftext,
                    min_score=min_score
                )
                st.session_state.results = {subreddits[0]: results}
            else:
                # Multiple subreddit search
                results = st.session_state.scraper.search_multiple_subreddits(
                    subreddits=subreddits,
                    keywords=keywords,
                    limit=limit,
                    sort_by=sort_by,
                    include_comments=include_comments,
                    include_selftext=include_selftext,
                    min_score=min_score
                )
                st.session_state.results = results

            # Add to search history
            search_info = {
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'subreddits': subreddits,
                'keywords': keywords,
                'total_results': sum(len(results) for results in st.session_state.results.values())
            }
            st.session_state.search_history.append(search_info)

            return True
    except Exception as e:
        st.error(f"Search failed: {str(e)}")
        return False

def filter_results(results, filters):
    """Apply filters to results"""
    filtered = {}

    for subreddit, posts in results.items():
        filtered_posts = []

        for post in posts:
            # Apply score filter
            if post['score'] < filters['min_score']:
                continue

            # Apply date filters if set
            if filters['date_from'] or filters['date_to']:
                post_date = datetime.strptime(post['created_utc'], '%Y-%m-%d %H:%M:%S')

                # Compare date parts only, since st.date_input returns datetime.date values
                if filters['date_from'] and post_date.date() < filters['date_from']:
                    continue
                if filters['date_to'] and post_date.date() > filters['date_to']:
                    continue

            # Filter for posts with comments if requested
            if filters['show_only_with_comments'] and (
                    'matching_comments' not in post or not post['matching_comments']):
                continue

            filtered_posts.append(post)

        filtered[subreddit] = filtered_posts

    return filtered

def create_data_visualization(results):
    """Create data visualizations based on results"""
    # Combine all results
    all_posts = []
    for subreddit, posts in results.items():
        for post in posts:
            post['subreddit'] = subreddit
            all_posts.append(post)

    if not all_posts:
        st.warning("No data to visualize.")
        return

    df = pd.DataFrame(all_posts)

    # Create tabs for different visualizations
    viz_tab1, viz_tab2, viz_tab3 = st.tabs(["Score Distribution", "Posts by Subreddit", "Time Analysis"])

    with viz_tab1:
        st.subheader("Score Distribution")
        fig = px.histogram(df, x="score", color="subreddit", nbins=20,
                           title="Distribution of Post Scores")
        st.plotly_chart(fig, use_container_width=True)

    with viz_tab2:
        st.subheader("Posts by Subreddit")
        subreddit_counts = df['subreddit'].value_counts().reset_index()
        subreddit_counts.columns = ['subreddit', 'count']
        fig = px.bar(subreddit_counts, x='subreddit', y='count',
                     title="Number of Matching Posts by Subreddit")
        st.plotly_chart(fig, use_container_width=True)

    with viz_tab3:
        st.subheader("Time Analysis")
        # Convert created_utc to datetime if it's not already
        if 'created_utc' in df.columns:
            df['created_date'] = pd.to_datetime(df['created_utc'])
            df['hour_of_day'] = df['created_date'].dt.hour

            fig = px.histogram(df, x="hour_of_day", nbins=24,
                               title="Posts by Hour of Day")
            fig.update_layout(xaxis_title="Hour of Day (UTC)")
            st.plotly_chart(fig, use_container_width=True)

def main():
    # Header
    st.markdown('<div class="main-header">Advanced Reddit Scraper</div>', unsafe_allow_html=True)
    st.markdown('<div class="subheader">Web Scraping Development Environment</div>', unsafe_allow_html=True)

    # Sidebar for configuration
    with st.sidebar:
        st.header("Configuration")

        # Credentials
        with st.expander("Reddit API Credentials", expanded=not st.session_state.scraper):
            client_id = st.text_input("Client ID", value="aBHOo9oQ3D-liyfGOc34cQ")
            client_secret = st.text_input("Client Secret", value="4__ziHwdOBNYjlGUG0k7XvK-r5OJDw", type="password")
            user_agent = st.text_input("User Agent", value="WebScraperUI/1.0")

            if st.button("Initialize API Connection"):
                if initialize_scraper(client_id, client_secret, user_agent):
                    st.success("API connection established!")

        # Search Parameters
        st.subheader("Search Parameters")

        # Multiple subreddit input
        subreddits_input = st.text_area("Subreddits (one per line)", value="cuny\ncollegequestions")
        subreddits = [s.strip() for s in subreddits_input.split("\n") if s.strip()]

        # Keywords input
        keywords_input = st.text_area("Keywords (one per line)", value="question\nhelp\nconfused")
        keywords = [k.strip() for k in keywords_input.split("\n") if k.strip()]

        # Other parameters
        limit = st.slider("Number of posts to scan per subreddit", 10, 200, 50)
        sort_by = st.selectbox("Sort posts by", ["hot", "new", "top", "rising"], index=0)
        include_selftext = st.checkbox("Include post content in search", value=True)
        include_comments = st.checkbox("Include comments in search", value=True)
        min_score = st.slider("Minimum score (upvotes)", 0, 1000, 0)

        # Action buttons
        search_col, clear_col = st.columns(2)
        with search_col:
            search_button = st.button("Run Search", type="primary", use_container_width=True)
        with clear_col:
            clear_button = st.button("Clear Results", type="secondary", use_container_width=True)

    # Main interface tabs
    tab1, tab2, tab3, tab4 = st.tabs(["Results", "Visualizations", "Export", "History"])

    # Handle Actions
    if clear_button:
        st.session_state.results = None
        st.rerun()

    if search_button:
        if not subreddits:
            st.error("Please enter at least one subreddit to search.")
        elif not keywords:
            st.error("Please enter at least one keyword to search.")
        else:
            success = run_search(
                subreddits=subreddits,
                keywords=keywords,
                limit=limit,
                sort_by=sort_by,
                include_comments=include_comments,
                include_selftext=include_selftext,
                min_score=min_score
            )
            if success:
                st.success(f"Search completed! Found results in {len(st.session_state.results)} subreddits.")

    # Tab 1: Results
    with tab1:
        if st.session_state.results:
            # Post-search filters
            st.markdown('<div class="card">', unsafe_allow_html=True)
            st.subheader("Filter Results")
            filter_col1, filter_col2, filter_col3 = st.columns(3)

            with filter_col1:
                st.session_state.filters['min_score'] = st.number_input(
                    "Minimum score", min_value=0, value=st.session_state.filters['min_score'])

            with filter_col2:
                st.session_state.filters['date_from'] = st.date_input(
                    "From date", value=None)

            with filter_col3:
                st.session_state.filters['date_to'] = st.date_input(
                    "To date", value=None)

            st.session_state.filters['show_only_with_comments'] = st.checkbox(
                "Show only posts with matching comments",
                value=st.session_state.filters['show_only_with_comments'])

            apply_filters = st.button("Apply Filters")
            st.markdown('</div>', unsafe_allow_html=True)

            # Apply filters if requested
            if apply_filters:
                filtered_results = filter_results(st.session_state.results, st.session_state.filters)
            else:
                filtered_results = st.session_state.results

            # Show results for each subreddit
            total_posts = sum(len(posts) for posts in filtered_results.values())
            st.subheader(f"Search Results ({total_posts} posts found)")

            for subreddit, posts in filtered_results.items():
                with st.expander(f"r/{subreddit} - {len(posts)} posts", expanded=len(filtered_results) == 1):
                    if posts:
                        # Create a dataframe for easier viewing
                        df = pd.DataFrame([{
                            'Title': p['title'],
                            'Score': p['score'],
                            'Comments': p['num_comments'],
                            'Date': p['created_utc'],
                            'URL': p['permalink']
                        } for p in posts])

                        st.dataframe(df, use_container_width=True)

                        # Show detailed post view
                        st.subheader("Post Details")
                        post_index = st.slider(f"Select post from r/{subreddit}",
                                               0, max(0, len(posts)-1), 0)

                        if len(posts) > 0:
                            post = posts[post_index]

                            # Display post details in a card
                            st.markdown('<div class="card">', unsafe_allow_html=True)
                            st.markdown(f"### {post['title']}")
                            st.markdown(f"**Author:** u/{post['author']} | **Score:** {post['score']} | **Comments:** {post['num_comments']}")
                            st.markdown(f"**Posted on:** {post['created_utc']}")
                            st.markdown(f"**URL:** [{post['url']}]({post['url']})")

                            if post['text']:
                                st.markdown("##### Post Content")
                                with st.container():
                                    show_content = st.checkbox("Show full content", key=f"content_{subreddit}_{post_index}")
                                    if show_content:
                                        st.text(post['text'])

                            # Show matching comments if available
                            if 'matching_comments' in post and post['matching_comments']:
                                st.markdown(f"##### Matching Comments ({len(post['matching_comments'])})")
                                with st.container():
                                    show_comments = st.checkbox("Show comments", value=True, key=f"comments_{subreddit}_{post_index}")
                                    if show_comments:
                                        for i, comment in enumerate(post['matching_comments']):
                                            st.markdown(f"**u/{comment['author']}** ({comment['score']} points) - {comment['created_utc']}")
                                            st.text(comment['body'])
                                            if i < len(post['matching_comments']) - 1:
                                                st.divider()

                            st.markdown('</div>', unsafe_allow_html=True)
                    else:
                        st.info(f"No posts found in r/{subreddit} matching the current filters.")
        else:
            st.info("Configure the search parameters and click 'Run Search' to begin.")

            # Show help for first-time users
            with st.expander("Help & Tips"):
                st.markdown("""
                ### Getting Started with Reddit Scraper

                1. **Set up API credentials** in the sidebar (already pre-filled with sample credentials)
                2. **Enter subreddits** you want to search (one per line)
                3. **Enter keywords** to filter posts (one per line)
                4. Adjust other settings as needed
                5. Click **Run Search** to start

                ### Tips for Effective Searches

                - Use specific keywords to narrow down results
                - Try searching multiple related subreddits for better coverage
                - Include comments in search to find discussions where your keywords appear in replies
                - Use the visualization tab to analyze trends in the results
                - Export your results for further analysis in other tools
                """)

    # Tab 2: Visualizations
    with tab2:
        if st.session_state.results:
            # Apply current filters to visualization data
            filtered_results = filter_results(st.session_state.results, st.session_state.filters)
            create_data_visualization(filtered_results)
        else:
            st.info("Run a search to generate visualizations.")

    # Tab 3: Export
    with tab3:
        if st.session_state.results:
            st.subheader("Export Results")

            # Apply current filters
            filtered_results = filter_results(st.session_state.results, st.session_state.filters)

            # Format selection
            export_format = st.radio("Export format", ["CSV", "JSON"], horizontal=True)

            # Filename input
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            default_filename = f"reddit_scrape_{timestamp}"
            filename = st.text_input("Filename (without extension)", value=default_filename)

            # Export button
            export_clicked = st.button("Export Data", type="primary")

            if export_clicked:
                try:
                    # Combine all results into a flat list for export
                    all_results = []
                    for subreddit, posts in filtered_results.items():
                        for post in posts:
                            post_copy = post.copy()
                            post_copy['subreddit'] = subreddit
                            all_results.append(post_copy)

                    # Save results based on selected format
                    if export_format == "CSV":
                        # Convert to dataframe and save
                        df = pd.DataFrame(all_results)

                        # Handle nested structures for CSV
                        if 'matching_comments' in df.columns:
                            df['matching_comments'] = df['matching_comments'].apply(
                                lambda x: json.dumps(x) if isinstance(x, list) else ''
                            )

                        csv_file = f"{filename}.csv"
                        df.to_csv(csv_file, index=False)

                        # Create download button
                        with open(csv_file, 'rb') as f:
                            st.download_button(
                                label="Download CSV",
                                data=f,
                                file_name=csv_file,
                                mime="text/csv"
                            )
                        st.success(f"Exported {len(all_results)} posts to {csv_file}")

                    else:  # JSON
                        json_file = f"{filename}.json"
                        with open(json_file, 'w') as f:
                            json.dump(all_results, f, indent=2)

                        # Create download button
                        with open(json_file, 'rb') as f:
                            st.download_button(
                                label="Download JSON",
                                data=f,
                                file_name=json_file,
                                mime="application/json"
                            )
                        st.success(f"Exported {len(all_results)} posts to {json_file}")

                except Exception as e:
                    st.error(f"Export failed: {str(e)}")
        else:
            st.info("Run a search to export results.")

    # Tab 4: History
    with tab4:
        st.subheader("Search History")

        if st.session_state.search_history:
            for i, search in enumerate(reversed(st.session_state.search_history)):
                with st.expander(f"Search #{len(st.session_state.search_history)-i}: {search['timestamp']} ({search['total_results']} results)"):
                    st.markdown(f"**Subreddits:** {', '.join(search['subreddits'])}")
                    st.markdown(f"**Keywords:** {', '.join(search['keywords'])}")
                    st.markdown(f"**Results:** {search['total_results']} posts")
                    st.markdown(f"**Time:** {search['timestamp']}")
        else:
            st.info("No search history yet.")

if __name__ == "__main__":
    main()
enhanced_scraper.py
ADDED
@@ -0,0 +1,219 @@
import praw
import pandas as pd
import datetime
import re
import json
import os
from typing import List, Dict, Any, Optional

class EnhancedRedditScraper:
    """
    An enhanced Reddit scraper that provides more advanced functionality
    than the basic RedditScraperAgent.
    """

    def __init__(self, client_id: str, client_secret: str, user_agent: str):
        """
        Initialize the Reddit scraper with API credentials.

        Args:
            client_id: Reddit API client ID
            client_secret: Reddit API client secret
            user_agent: User agent string for Reddit API
        """
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        self.last_search_results = []

    def scrape_subreddit(self,
                         subreddit_name: str,
                         keywords: List[str],
                         limit: int = 100,
                         sort_by: str = "hot",
                         include_comments: bool = False,
                         min_score: int = 0,
                         include_selftext: bool = True) -> List[Dict[str, Any]]:
        """
        Scrape a subreddit for posts containing specified keywords.

        Args:
            subreddit_name: Name of the subreddit to scrape
            keywords: List of keywords to search for
            limit: Maximum number of posts to retrieve
            sort_by: How to sort posts ('hot', 'new', 'top', 'rising')
            include_comments: Whether to search post comments
            min_score: Minimum score (upvotes) for posts
            include_selftext: Whether to search post content (selftext)

        Returns:
            List of matching post dictionaries
        """
        subreddit = self.reddit.subreddit(subreddit_name)
        results = []

        # Choose the right sort method
        if sort_by == "hot":
            submissions = subreddit.hot(limit=limit)
        elif sort_by == "new":
            submissions = subreddit.new(limit=limit)
        elif sort_by == "top":
            submissions = subreddit.top(limit=limit)
        elif sort_by == "rising":
            submissions = subreddit.rising(limit=limit)
        else:
            submissions = subreddit.hot(limit=limit)

        # Process each submission
        for submission in submissions:
            # Check if post meets the minimum score requirement
            if submission.score < min_score:
                continue

            # Check for keywords in title or selftext
            title_match = any(keyword.lower() in submission.title.lower() for keyword in keywords)
            selftext_match = False

            if include_selftext:
                selftext_match = any(keyword.lower() in submission.selftext.lower() for keyword in keywords)

            comment_match = False
            comments_data = []

            # Search comments if enabled
            if include_comments:
                submission.comments.replace_more(limit=3)  # Load some MoreComments
                for comment in submission.comments.list()[:20]:  # Limit to first 20 comments
                    if any(keyword.lower() in comment.body.lower() for keyword in keywords):
                        comment_match = True
                        comments_data.append({
                            'author': str(comment.author),
                            'body': comment.body,
                            'score': comment.score,
                            'created_utc': datetime.datetime.fromtimestamp(comment.created_utc).strftime('%Y-%m-%d %H:%M:%S')
                        })

            # Add post to results if it matches criteria
            if title_match or selftext_match or comment_match:
                created_time = datetime.datetime.fromtimestamp(submission.created_utc)

                post_data = {
                    'title': submission.title,
                    'text': submission.selftext,
                    'url': submission.url,
                    'score': submission.score,
                    'id': submission.id,
                    'author': str(submission.author),
                    'created_utc': created_time.strftime('%Y-%m-%d %H:%M:%S'),
                    'upvote_ratio': submission.upvote_ratio,
                    'num_comments': submission.num_comments,
                    'permalink': f"https://www.reddit.com{submission.permalink}",
                }

                if include_comments and comments_data:
                    post_data['matching_comments'] = comments_data

                results.append(post_data)

        # Store last search results
        self.last_search_results = results
        return results

    def search_multiple_subreddits(self,
                                   subreddits: List[str],
                                   keywords: List[str],
                                   **kwargs) -> Dict[str, List[Dict[str, Any]]]:
        """
        Search multiple subreddits for the same keywords.

        Args:
            subreddits: List of subreddit names to search
            keywords: List of keywords to search for
            **kwargs: Additional arguments to pass to scrape_subreddit

        Returns:
            Dictionary mapping subreddit names to their results
        """
        results = {}
        for subreddit in subreddits:
            results[subreddit] = self.scrape_subreddit(subreddit, keywords, **kwargs)
        return results

    def save_results_to_csv(self, filename: str) -> str:
        """
        Save the last search results to a CSV file.

        Args:
            filename: Name of the file to save (without extension)

        Returns:
            Path to the saved file
        """
        if not self.last_search_results:
            raise ValueError("No search results to save. Run a search first.")

        df = pd.DataFrame(self.last_search_results)

        # Clean up comment data for CSV format
        if 'matching_comments' in df.columns:
            df['matching_comments'] = df['matching_comments'].apply(
                lambda x: json.dumps(x) if isinstance(x, list) else ''
            )

        # Add timestamp to filename
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        full_filename = f"{filename}_{timestamp}.csv"

        df.to_csv(full_filename, index=False)
        return os.path.abspath(full_filename)

    def save_results_to_json(self, filename: str) -> str:
        """
        Save the last search results to a JSON file.

        Args:
            filename: Name of the file to save (without extension)

        Returns:
            Path to the saved file
        """
        if not self.last_search_results:
            raise ValueError("No search results to save. Run a search first.")

        # Add timestamp to filename
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        full_filename = f"{filename}_{timestamp}.json"

        with open(full_filename, 'w', encoding='utf-8') as f:
            json.dump(self.last_search_results, f, ensure_ascii=False, indent=2)

        return os.path.abspath(full_filename)


# Example usage
if __name__ == "__main__":
    # Create the scraper instance
    scraper = EnhancedRedditScraper(
        client_id="aBHOo9oQ3D-liyfGOc34cQ",
        client_secret="4__ziHwdOBNYjlGUG0k7XvK-r5OJDw",
        user_agent="rcuny"
    )

    # Simple example
    results = scraper.scrape_subreddit(
        subreddit_name="cuny",
        keywords=["question", "help", "confused"],
        limit=25,
        sort_by="hot",
        include_comments=True
    )

    print(f"Found {len(results)} matching posts")

    # Save results to file
    if results:
        csv_path = scraper.save_results_to_csv("reddit_results")
        json_path = scraper.save_results_to_json("reddit_results")
        print(f"Results saved to {csv_path} and {json_path}")
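A complementary sketch (not part of this commit) of `search_multiple_subreddits`, which the advanced UI calls when more than one subreddit is entered; extra keyword arguments are forwarded to `scrape_subreddit`, and the credentials below are placeholders:

```python
from enhanced_scraper import EnhancedRedditScraper

# Placeholders: substitute your own Reddit API credentials.
scraper = EnhancedRedditScraper(
    client_id="your_client_id_here",
    client_secret="your_client_secret_here",
    user_agent="WebScraperUI/1.0",
)

# One scrape per subreddit; limit, sort_by, min_score, etc. are passed through.
results = scraper.search_multiple_subreddits(
    subreddits=["cuny", "collegequestions"],
    keywords=["question", "help", "confused"],
    limit=50,
    sort_by="hot",
    min_score=5,
)

for subreddit, posts in results.items():
    print(f"r/{subreddit}: {len(posts)} matching posts")
```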
requirements.txt
ADDED
@@ -0,0 +1,5 @@
praw>=7.7.0
pandas>=1.3.0
streamlit>=1.3.0
plotly>=5.5.0
matplotlib>=3.5.0
run_scraper.sh
ADDED
@@ -0,0 +1,36 @@
#!/bin/bash

# Script to run the Reddit web scraper UI

# Check if Python is installed
if ! command -v python3 &> /dev/null; then
    echo "Python 3 is required but could not be found. Please install Python 3."
    exit 1
fi

# Check if pip is installed
if ! command -v pip3 &> /dev/null; then
    echo "pip3 is required but could not be found. Please install pip."
    exit 1
fi

# Check for virtual environment
if [[ ! -d "venv" ]]; then
    echo "Creating virtual environment..."
    python3 -m venv venv
fi

# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate

# Install or update dependencies
echo "Installing dependencies..."
pip install -r requirements.txt

# Run the Reddit scraper UI
echo "Starting Reddit scraper UI..."
streamlit run advanced_scraper_ui.py

# Deactivate virtual environment on exit
deactivate