MMADS commited on
Commit
28f96f2
·
1 Parent(s): 295eab6

fell back to deprecated api

Browse files
Files changed (2) hide show
  1. app.py +183 -492
  2. requirements.txt +1 -2
app.py CHANGED
@@ -1,56 +1,29 @@
1
  import json
2
  import logging
 
3
  import os
4
  from collections import OrderedDict
5
  from datetime import datetime
6
- from threading import Lock
7
- import time
8
- from typing import Dict, Optional, Tuple
9
 
10
  import gradio as gr
11
  import pandas as pd
12
  import plotly.express as px
13
  import requests
14
- from requests.adapters import HTTPAdapter
15
- from urllib3.util.retry import Retry
16
 
17
  # Configure logging for the application
18
- logging.basicConfig(
19
- level=logging.INFO,
20
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
21
- )
22
  logger = logging.getLogger(__name__)
23
 
24
  # --- Constants and Global Variables ---
25
 
26
  CURRENT_YEAR = datetime.now().year
27
- NVD_API_V2_URL = "https://services.nvd.nist.gov/rest/json/cves/2.0"
28
- RESULTS_PER_PAGE = 2000 # Max allowed by the API
29
-
30
- # Thread-safe cache with lock
31
- CACHE_MAX_SIZE = 5
32
- DATAFRAME_CACHE: Dict[int, Tuple[pd.DataFrame, float]] = OrderedDict()
33
- CACHE_LOCK = Lock()
34
- CACHE_TTL = 3600 # Cache TTL in seconds (1 hour)
35
-
36
- # HTTP session with retry strategy
37
- SESSION = requests.Session()
38
- retry_strategy = Retry(
39
- total=5,
40
- backoff_factor=1,
41
- status_forcelist=[429, 500, 502, 503, 504],
42
- )
43
- adapter = HTTPAdapter(max_retries=retry_strategy)
44
- SESSION.mount("http://", adapter)
45
- SESSION.mount("https://", adapter)
46
-
47
- # NVD API Key from environment variables
48
- NVD_API_KEY = os.environ.get("NVD_API_KEY")
49
- if NVD_API_KEY:
50
- logger.info("NVD API key found and will be used.")
51
- SESSION.headers.update({"apiKey": NVD_API_KEY})
52
- else:
53
- logger.warning("NVD_API_KEY environment variable not set. Using public, rate-limited access.")
54
 
55
  # Profiles for tailoring LLM-generated summaries to different audiences
56
  AUDIENCE_PROFILES = {
@@ -86,191 +59,83 @@ AUDIENCE_PROFILES = {
86
  }
87
  }
88
 
89
- # Valid year range for NVD feeds
90
- MIN_YEAR = 2002
91
- MAX_YEAR = CURRENT_YEAR
92
-
93
-
94
- # --- Utility Functions ---
95
-
96
- def validate_year(year: int) -> bool:
97
- """Validates if the year is within the acceptable range."""
98
- return MIN_YEAR <= year <= MAX_YEAR
99
-
100
 
101
- def clean_cache() -> None:
102
- """Removes expired entries from the cache."""
103
- current_time = time.time()
104
- with CACHE_LOCK:
105
- expired_keys = [
106
- key for key, (_, timestamp) in DATAFRAME_CACHE.items()
107
- if current_time - timestamp > CACHE_TTL
108
- ]
109
- for key in expired_keys:
110
- if key in DATAFRAME_CACHE:
111
- del DATAFRAME_CACHE[key]
112
- logger.info(f"Removed expired cache entry for year {key}")
113
-
114
-
115
- # --- Data Fetching and Parsing (FIXED for API v2.0) ---
116
 
117
  def get_cve_dataframe(year: int) -> pd.DataFrame:
118
  """
119
- Fetches, parses, and caches CVE data for a specific year from the NVD API 2.0.
120
- Returns a pandas DataFrame with thread-safe caching.
121
  """
122
- if not validate_year(year):
123
- raise gr.Error(f"Invalid year: {year}. Please select a year between {MIN_YEAR} and {MAX_YEAR}.")
124
-
125
- clean_cache()
126
-
127
- with CACHE_LOCK:
128
- if year in DATAFRAME_CACHE:
129
- logger.info(f"Cache hit for year {year}.")
130
- DATAFRAME_CACHE.move_to_end(year)
131
- return DATAFRAME_CACHE[year][0].copy()
132
-
133
- logger.info(f"Cache miss. Fetching NVD data for year {year} from API v2.0.")
134
 
135
- # Format dates correctly for NVD API v2.0
136
- # The API expects dates in the format: YYYY-MM-DDTHH:MM:SS.mmm
137
- start_date = f"{year}-01-01T00:00:00.000"
138
- end_date = f"{year}-12-31T23:59:59.999"
139
-
140
- all_vulnerabilities = []
141
- start_index = 0
142
 
143
  try:
144
- while True:
145
- params = {
146
- 'pubStartDate': start_date,
147
- 'pubEndDate': end_date,
148
- 'resultsPerPage': RESULTS_PER_PAGE,
149
- 'startIndex': start_index
150
- }
151
-
152
- logger.info(f"Requesting CVEs from index {start_index}...")
153
- response = SESSION.get(NVD_API_V2_URL, params=params, timeout=60)
154
-
155
- if response.status_code == 404:
156
- logger.error(f"404 Error: URL requested: {response.url}")
157
- logger.error(f"Response content: {response.text[:500]}")
158
-
159
- response.raise_for_status()
160
-
161
- data = response.json()
162
- vulnerabilities = data.get("vulnerabilities", [])
163
- all_vulnerabilities.extend(vulnerabilities)
164
 
165
- total_results = data.get("totalResults", 0)
166
- logger.info(f"Retrieved {len(vulnerabilities)} CVEs. Total: {total_results}")
167
-
168
- start_index += len(vulnerabilities)
169
-
170
- if start_index >= total_results:
171
- break
172
-
173
- # Rate limiting: 6 seconds with API key, 10 seconds without
174
- time.sleep(6 if NVD_API_KEY else 10)
175
 
176
- if not all_vulnerabilities:
177
- logger.warning(f"No CVE data found for year {year}")
178
- raise gr.Error(f"No CVE data available for year {year}.")
179
 
180
- df = parse_cve_items(all_vulnerabilities)
181
-
182
- with CACHE_LOCK:
183
- if len(DATAFRAME_CACHE) >= CACHE_MAX_SIZE:
184
- DATAFRAME_CACHE.popitem(last=False)
185
- DATAFRAME_CACHE[year] = (df, time.time())
186
-
187
- logger.info(f"Successfully cached {len(df)} CVEs for year {year}")
188
- return df.copy()
189
 
190
- except requests.exceptions.Timeout:
191
- logger.error(f"Timeout while fetching data for {year}")
192
- raise gr.Error("Request timed out. The NVD API might be busy. Please try again.")
193
  except requests.exceptions.HTTPError as e:
194
  logger.error(f"HTTP Error for {year}: {e}")
195
- raise gr.Error(f"Failed to fetch data for {year}. HTTP Error: {e.response.status_code}")
196
- except json.JSONDecodeError as e:
197
- logger.error(f"Failed to parse JSON for {year}: {e}")
198
- raise gr.Error(f"Data for {year} is corrupted or invalid.")
199
  except Exception as e:
200
- logger.error(f"Unexpected error processing feed for {year}: {e}", exc_info=True)
201
  raise gr.Error(f"An unexpected error occurred: {str(e)}")
202
 
203
-
204
- def parse_cve_items(vulnerabilities: list) -> pd.DataFrame:
205
  """
206
- Extracts vulnerability details from the NVD API v2.0 JSON data.
207
  """
208
  rows = []
209
-
210
- for item in vulnerabilities:
211
- cve_data = item.get("cve", {})
212
- if not cve_data:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  continue
214
 
215
- cve_id = cve_data.get("id", "N/A")
216
-
217
- # Get English description
218
- description = "No description available"
219
- for desc in cve_data.get("descriptions", []):
220
- if desc.get("lang") == "en":
221
- description = desc.get("value", description)
222
- break
223
-
224
- published = cve_data.get("published", "N/A")
225
-
226
- # Parse CVSS metrics (prioritize v3.1, then v3.0, then v2)
227
- base_score, severity, attack_vector = None, "N/A", "N/A"
228
- metrics = cve_data.get("metrics", {})
229
-
230
- if "cvssMetricV31" in metrics and metrics["cvssMetricV31"]:
231
- metric_data = metrics["cvssMetricV31"][0].get("cvssData", {})
232
- base_score = metric_data.get("baseScore")
233
- severity = metric_data.get("baseSeverity", "N/A")
234
- attack_vector = metric_data.get("attackVector", "N/A")
235
- elif "cvssMetricV30" in metrics and metrics["cvssMetricV30"]:
236
- metric_data = metrics["cvssMetricV30"][0].get("cvssData", {})
237
- base_score = metric_data.get("baseScore")
238
- severity = metric_data.get("baseSeverity", "N/A")
239
- attack_vector = metric_data.get("attackVector", "N/A")
240
- elif "cvssMetricV2" in metrics and metrics["cvssMetricV2"]:
241
- metric_data = metrics["cvssMetricV2"][0]
242
- cvss_data = metric_data.get("cvssData", {})
243
- base_score = cvss_data.get("baseScore")
244
- severity = metric_data.get("baseSeverity", "N/A")
245
- attack_vector = cvss_data.get("accessVector", "N/A")
246
-
247
- # Extract CWE IDs
248
- cwe_ids = []
249
- for weakness in cve_data.get("weaknesses", []):
250
- for desc in weakness.get("description", []):
251
- if desc.get("lang") == "en":
252
- cwe_id = desc.get("value")
253
- if cwe_id and cwe_id.startswith("CWE-"):
254
- cwe_ids.append(cwe_id)
255
-
256
- rows.append({
257
- "CVE_ID": cve_id,
258
- "Description": description,
259
- "Published": published[:10] if published and published != "N/A" else "N/A",
260
- "Base_Score": base_score,
261
- "Severity": severity.upper() if severity and severity != "N/A" else "N/A",
262
- "Attack_Vector": attack_vector.upper() if attack_vector and attack_vector != "N/A" else "N/A",
263
- "CWE_IDs": ", ".join(cwe_ids) if cwe_ids else "N/A"
264
- })
265
-
266
- if not rows:
267
- logger.warning("No valid CVE items could be parsed")
268
- return pd.DataFrame()
269
-
270
  df = pd.DataFrame(rows)
271
- df["Base_Score"] = pd.to_numeric(df["Base_Score"], errors='coerce')
272
- df = df.sort_values("Published", ascending=False, na_position='last').reset_index(drop=True)
273
-
274
  return df
275
 
276
 
@@ -281,341 +146,167 @@ def generate_tailored_summary(cve_description: str, audience: str, hf_token: str
281
  Generates a tailored CVE summary using the Hugging Face Inference API.
282
  """
283
  if not hf_token:
284
- return "⚠️ Hugging Face API token is not configured. Please set the HF_TOKEN environment variable."
285
- if not cve_description or cve_description == "":
286
- return "Please select a CVE from the table first."
287
- if audience not in AUDIENCE_PROFILES:
288
- return "Invalid audience selected."
289
 
290
  api_url = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
291
  headers = {"Authorization": f"Bearer {hf_token}"}
292
- profile = AUDIENCE_PROFILES[audience]
293
 
294
  prompt = f"""<s>[INST] You are an expert cybersecurity analyst. Your task is to rewrite the following technical CVE description into a concise, actionable summary for a specific professional audience.
295
 
296
- **Target Audience:** {audience}
297
- - **Focus:** {profile.get('focus', 'N/A')}
298
- - **Key Priorities:** {', '.join(profile.get('priorities', []))}
299
-
300
- **Original CVE Description:**
301
- ---
302
- {cve_description}
303
- ---
304
-
305
- Provide a clear, concise summary (max 200 words) in a {profile.get('tone', 'professional')} tone, focusing on what matters most to this audience. Include actionable insights and recommendations. [/INST]"""
306
-
307
- payload = {
308
- "inputs": prompt,
309
- "parameters": {
310
- "max_new_tokens": 256,
311
- "temperature": 0.7,
312
- "top_p": 0.95,
313
- "return_full_text": False
314
- }
315
- }
316
 
317
  try:
318
- response = SESSION.post(api_url, headers=headers, json=payload, timeout=60)
319
-
320
- if response.status_code == 503:
321
- return " The model is currently loading. Please try again in a few moments."
322
- elif response.status_code == 401:
323
- return "❌ Invalid API token. Please check your Hugging Face token."
324
- elif response.status_code != 200:
325
- error_data = response.json()
326
- error_message = error_data.get("error", "Unknown error")
327
- logger.error(f"Inference API Error ({response.status_code}): {error_message}")
328
- return f"⚠️ API Error: {error_message}"
329
 
330
- result = response.json()
331
- if isinstance(result, list) and len(result) > 0:
332
- generated_text = result[0].get('generated_text', '').strip()
333
- if generated_text:
334
- return f"### Tailored Summary for {audience}\n\n{generated_text}"
335
- else:
336
- return "⚠️ The model returned an empty response. Please try again."
337
- else:
338
- return "⚠️ Unexpected response format from the API."
339
- except requests.exceptions.Timeout:
340
- logger.error("Timeout while calling Inference API")
341
- return "⏱️ Request timed out. The model might be overloaded. Please try again."
342
- except Exception as e:
343
- logger.error(f"Unexpected error in generate_tailored_summary: {e}")
344
- return f"❌ An unexpected error occurred: {str(e)}"
345
 
346
 
347
  # --- Analysis and Visualization ---
348
 
349
- def analyze_and_visualize(
350
- df: Optional[pd.DataFrame],
351
- severity: str,
352
- vector: str,
353
- search: str
354
- ) -> Tuple[pd.DataFrame, Optional[px.bar], Optional[px.line], str]:
355
  """
356
- Filters the main DataFrame and generates all outputs.
 
357
  """
358
  if df is None or df.empty:
359
- empty_df = pd.DataFrame(columns=["CVE_ID", "Severity", "Base_Score", "Description"])
360
- return empty_df, None, None, "### No Data Loaded\n\nPlease select a year to load CVE data."
361
-
362
- try:
363
- filtered_df = df.copy()
364
-
365
- if severity and severity != "All":
366
- filtered_df = filtered_df[filtered_df["Severity"] == severity]
367
- if vector and vector != "All":
368
- filtered_df = filtered_df[filtered_df["Attack_Vector"] == vector]
369
- if search and search.strip():
370
- search_term = search.strip()
371
- masks = [
372
- filtered_df[col].str.contains(search_term, case=False, na=False)
373
- for col in ["CVE_ID", "Description", "CWE_IDs"] if col in filtered_df.columns
374
- ]
375
- if masks:
376
- combined_mask = pd.concat(masks, axis=1).any(axis=1)
377
- filtered_df = filtered_df[combined_mask]
378
-
379
- severity_chart = create_severity_chart(filtered_df)
380
- timeline_chart = create_timeline_chart(filtered_df)
381
- summary_text = create_summary_text(filtered_df)
382
-
383
- display_columns = ["CVE_ID", "Severity", "Base_Score", "Description"]
384
- display_df = filtered_df[[col for col in display_columns if col in filtered_df.columns]]
385
-
386
- return display_df, severity_chart, timeline_chart, summary_text
387
- except Exception as e:
388
- logger.error(f"Error in analyze_and_visualize: {e}", exc_info=True)
389
- empty_df = pd.DataFrame(columns=["CVE_ID", "Severity", "Base_Score", "Description"])
390
- return empty_df, None, None, f"### Error\n\nAn error occurred while filtering data: {str(e)}"
391
-
392
-
393
- def create_severity_chart(df: pd.DataFrame) -> Optional[px.bar]:
394
  """Creates a bar chart for CVE severity distribution."""
395
- if df.empty or "Severity" not in df.columns:
396
- return None
397
- try:
398
- order = ["CRITICAL", "HIGH", "MEDIUM", "LOW", "N/A"]
399
- counts = df["Severity"].value_counts().reindex(order, fill_value=0)
400
- color_map = {"CRITICAL": "#8B0000", "HIGH": "#FF4500", "MEDIUM": "#FFA500", "LOW": "#FFD700", "N/A": "#D3D3D3"}
401
- fig = px.bar(
402
- x=counts.index, y=counts.values,
403
- labels={"x": "Severity Level", "y": "Number of CVEs"},
404
- title="CVE Severity Distribution",
405
- color=counts.index, color_discrete_map=color_map, text=counts.values
406
- )
407
- fig.update_traces(texttemplate='%{text}', textposition='outside')
408
- fig.update_layout(showlegend=False, xaxis={'categoryorder': 'array', 'categoryarray': order})
409
- return fig
410
- except Exception as e:
411
- logger.error(f"Error creating severity chart: {e}")
412
- return None
413
 
414
- def create_timeline_chart(df: pd.DataFrame) -> Optional[px.line]:
415
  """Creates a line chart showing CVE publications over time."""
416
- if df.empty or 'Published' not in df.columns:
417
- return None
418
- try:
419
- df_copy = df.copy()
420
- df_copy["Date"] = pd.to_datetime(df_copy["Published"], errors='coerce')
421
- df_copy.dropna(subset=["Date"], inplace=True)
422
- if df_copy.empty: return None
423
-
424
- counts = df_copy.set_index("Date").resample('M').size()
425
- if counts.empty: return None
426
-
427
- fig = px.line(
428
- x=counts.index, y=counts.values,
429
- labels={"x": "Month", "y": "Number of CVEs"},
430
- title="CVE Publications Timeline", markers=True
431
- )
432
- return fig
433
- except Exception as e:
434
- logger.error(f"Error creating timeline chart: {e}")
435
- return None
436
 
 
 
 
 
437
 
438
  def create_summary_text(df: pd.DataFrame) -> str:
439
- """Generates a markdown string with key statistics."""
440
- if df.empty:
441
- return "### No Results\n\nNo CVEs match your current filter criteria."
442
- try:
443
- total_cves = len(df)
444
- sev_counts = df['Severity'].value_counts() if 'Severity' in df.columns else {}
445
- scores = df['Base_Score'].dropna()
446
- avg_score = f"{scores.mean():.2f}" if not scores.empty else "N/A"
447
- max_score = f"{scores.max():.1f}" if not scores.empty else "N/A"
448
-
449
- return "\n".join([
450
- f"### Summary Statistics",
451
- f"- **Total CVEs Found:** {total_cves:,}",
452
- f"- **Critical:** {sev_counts.get('CRITICAL', 0):,}",
453
- f"- **High:** {sev_counts.get('HIGH', 0):,}",
454
- f"- **Medium:** {sev_counts.get('MEDIUM', 0):,}",
455
- f"- **Low:** {sev_counts.get('LOW', 0):,}",
456
- f"- **Average Base Score:** {avg_score}",
457
- f"- **Maximum Base Score:** {max_score}"
458
- ])
459
- except Exception as e:
460
- logger.error(f"Error creating summary text: {e}")
461
- return f"### Error\n\nCould not generate summary: {str(e)}"
462
 
463
  # --- Gradio UI and Event Logic ---
464
 
465
  def create_dashboard():
466
- """Builds the entire Gradio interface."""
467
-
468
- with gr.Blocks(theme=gr.themes.Soft(), title="CVE Dashboard - NVD API v2.0 Analyzer") as dashboard:
469
-
470
- df_state = gr.State(value=None)
471
- selected_cve_description = gr.State(value="")
472
- hf_token_state = gr.State(value=os.environ.get("HF_TOKEN", ""))
473
-
474
- gr.Markdown(
475
- """
476
- # 🛡️ CVE Dashboard: NVD API v2.0 Analyzer
477
- Explore Common Vulnerabilities and Exposures (CVE) data from the National Vulnerability Database, fetched live using the NVD API 2.0.
478
-
479
- **Note:** For faster loading times, set an `NVD_API_KEY` in your environment. You can request one from the [NVD website](https://nvd.nist.gov/developers/request-an-api-key).
480
- """
481
- )
482
 
483
  with gr.Row():
484
  with gr.Column(scale=1):
485
- gr.Markdown("### 🎛️ Controls")
486
- year_dd = gr.Dropdown(
487
- choices=list(range(MIN_YEAR, MAX_YEAR + 1))[::-1], value=CURRENT_YEAR - 1,
488
- label="1. Select Year", info="Choose a year to load CVE data"
489
- )
490
-
491
- gr.Markdown("### 🔍 Filters")
492
- severity_dd = gr.Dropdown(
493
- choices=["All", "CRITICAL", "HIGH", "MEDIUM", "LOW"], value="All",
494
- label="2. Severity Level", info="Filter by CVSS severity rating"
495
- )
496
- vector_dd = gr.Dropdown(
497
- choices=["All", "NETWORK", "ADJACENT_NETWORK", "LOCAL", "PHYSICAL"], value="All",
498
- label="3. Attack Vector", info="Filter by attack vector type"
499
- )
500
- search_tb = gr.Textbox(
501
- label="4. Search", placeholder="e.g., 'Log4j', 'SQL injection', 'CWE-89'...",
502
- info="Search in CVE IDs, descriptions, and CWE IDs"
503
- )
504
- filter_btn = gr.Button("🔄 Apply Filters", variant="primary", size="lg")
505
-
506
  with gr.Column(scale=3):
507
- summary_out = gr.Markdown(value="### Loading...")
508
  with gr.Tabs():
509
  with gr.TabItem("📊 Data Table"):
510
- table_out = gr.DataFrame(
511
- headers=["CVE_ID", "Severity", "Base_Score", "Description"],
512
- wrap=True,
513
- row_count=20,
514
- interactive=True,
515
- label="CVE Data"
516
- )
517
- with gr.TabItem("📈 Severity Analysis"):
518
- plot_severity_out = gr.Plot(label="Severity Distribution")
519
- with gr.TabItem("📉 Timeline Analysis"):
520
- plot_timeline_out = gr.Plot(label="Publication Timeline")
521
 
522
- with gr.Accordion(
523
- "🤖 AI-Powered CVE Analysis (Select a CVE from the table)",
524
- open=False, visible=False
525
- ) as llm_accordion:
526
  with gr.Row():
527
  with gr.Column(scale=2):
528
- original_desc_out = gr.Textbox(
529
- label="Original CVE Description", lines=6, interactive=False, show_copy_button=True
530
- )
531
  with gr.Column(scale=1):
532
- audience_dd = gr.Dropdown(
533
- choices=list(AUDIENCE_PROFILES.keys()), value="Cybersecurity Professional",
534
- label="Target Audience", info="Select your role for a tailored summary"
535
- )
536
- generate_btn = gr.Button("✨ Generate Tailored Summary", variant="primary")
537
- summary_llm_out = gr.Markdown(value="*Select an audience and click 'Generate'...*")
538
-
539
  def on_year_change(year):
540
- """Handle year selection change."""
541
- try:
542
- if year is None:
543
- return None, pd.DataFrame(), None, None, "### Please select a year"
544
- df = get_cve_dataframe(int(year))
545
- return df, *analyze_and_visualize(df, "All", "All", "")
546
- except Exception as e:
547
- logger.error(f"Error in on_year_change: {e}")
548
- return None, pd.DataFrame(), None, None, f"### Error\n\n{str(e)}"
549
-
550
- def on_select_cve(full_df: pd.DataFrame, evt: gr.SelectData):
551
- """Handle CVE row selection safely."""
552
- try:
553
- if full_df is None or evt.value is None:
554
- return "", "", gr.update(visible=False)
555
-
556
- # Extract the CVE_ID from the first column of the selected row
557
- if hasattr(evt, 'index') and isinstance(evt.index, list) and len(evt.index) >= 2:
558
- row_idx = evt.index[0]
559
- selected_cve_id = full_df.iloc[row_idx]["CVE_ID"]
560
- else:
561
- # Fallback: try to use the value directly
562
- selected_cve_id = evt.value
563
-
564
- cve_record = full_df[full_df["CVE_ID"] == selected_cve_id]
565
- if cve_record.empty:
566
- return "", "Could not find details for the selected CVE.", gr.update(visible=False)
567
 
568
- full_description = cve_record.iloc[0]["Description"]
569
- return full_description, full_description, gr.update(visible=True)
570
- except Exception as e:
571
- logger.error(f"Error in on_select_cve: {e}", exc_info=True)
572
- return "", "Error loading CVE details", gr.update(visible=False)
573
 
574
- analysis_outputs = [table_out, plot_severity_out, plot_timeline_out, summary_out]
575
  filter_inputs = [df_state, severity_dd, vector_dd, search_tb]
576
-
577
- year_dd.change(
578
- fn=on_year_change, inputs=[year_dd],
579
- outputs=[df_state, *analysis_outputs], show_progress="full"
580
- )
581
- dashboard.load(
582
- fn=on_year_change, inputs=[year_dd],
583
- outputs=[df_state, *analysis_outputs], show_progress="full"
584
- )
585
-
586
- filter_btn.click(
587
- fn=analyze_and_visualize, inputs=filter_inputs, outputs=analysis_outputs
588
- )
589
- search_tb.submit(
590
- fn=analyze_and_visualize, inputs=filter_inputs, outputs=analysis_outputs
591
- )
592
- for control in [severity_dd, vector_dd]:
593
- control.change(
594
- fn=analyze_and_visualize, inputs=filter_inputs, outputs=analysis_outputs
595
- )
596
-
597
- table_out.select(
598
- fn=on_select_cve,
599
- inputs=[df_state],
600
- outputs=[selected_cve_description, original_desc_out, llm_accordion],
601
- show_progress="hidden"
602
- )
603
-
604
- generate_btn.click(
605
- fn=generate_tailored_summary,
606
- inputs=[selected_cve_description, audience_dd, hf_token_state],
607
- outputs=[summary_llm_out]
608
- )
609
 
 
 
 
 
 
 
 
 
 
 
610
  return dashboard
611
 
612
  if __name__ == "__main__":
613
- try:
614
- if not os.environ.get("HF_TOKEN"):
615
- logger.warning("HF_TOKEN not found. AI features will be limited.")
616
-
617
- cve_dashboard = create_dashboard()
618
- cve_dashboard.launch(server_name="0.0.0.0", show_error=True)
619
- except Exception as e:
620
- logger.error(f"Failed to launch application: {e}", exc_info=True)
621
- raise
 
1
  import json
2
  import logging
3
+ import gzip
4
  import os
5
  from collections import OrderedDict
6
  from datetime import datetime
7
+ from io import BytesIO
8
+ from typing import Dict
 
9
 
10
  import gradio as gr
11
  import pandas as pd
12
  import plotly.express as px
13
  import requests
 
 
14
 
15
  # Configure logging for the application
16
+ logging.basicConfig(level=logging.INFO)
 
 
 
17
  logger = logging.getLogger(__name__)
18
 
19
  # --- Constants and Global Variables ---
20
 
21
  CURRENT_YEAR = datetime.now().year
22
+ NVD_BASE_URL = "https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-{year}.json.gz"
23
+
24
+ # In-memory LRU cache (by insertion order) to store DataFrames for recent years.
25
+ CACHE_MAX_SIZE = 3
26
+ DATAFRAME_CACHE: Dict[int, pd.DataFrame] = OrderedDict()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  # Profiles for tailoring LLM-generated summaries to different audiences
29
  AUDIENCE_PROFILES = {
 
59
  }
60
  }
61
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ # --- Data Fetching and Parsing ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  def get_cve_dataframe(year: int) -> pd.DataFrame:
66
  """
67
+ Downloads, parses, and caches the NVD feed for a specific year.
68
+ It returns a pandas DataFrame. Caching is used to avoid repeated downloads.
69
  """
70
+ if year in DATAFRAME_CACHE:
71
+ logger.info(f"Cache hit for year {year}.")
72
+ DATAFRAME_CACHE.move_to_end(year) # Mark as recently used
73
+ return DATAFRAME_CACHE[year]
 
 
 
 
 
 
 
 
74
 
75
+ logger.info(f"Cache miss. Downloading NVD data for year {year}.")
76
+ url = NVD_BASE_URL.format(year=year)
 
 
 
 
 
77
 
78
  try:
79
+ response = requests.get(url, timeout=30)
80
+ response.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
+ with gzip.GzipFile(fileobj=BytesIO(response.content)) as f:
83
+ nvd_data = json.load(f)
 
 
 
 
 
 
 
 
84
 
85
+ df = parse_cve_items(nvd_data)
 
 
86
 
87
+ if len(DATAFRAME_CACHE) >= CACHE_MAX_SIZE:
88
+ DATAFRAME_CACHE.popitem(last=False)
89
+ DATAFRAME_CACHE[year] = df
90
+ return df
 
 
 
 
 
91
 
 
 
 
92
  except requests.exceptions.HTTPError as e:
93
  logger.error(f"HTTP Error for {year}: {e}")
94
+ raise gr.Error(f"Failed to download data for {year}. The feed may be unavailable.")
 
 
 
95
  except Exception as e:
96
+ logger.error(f"Error processing feed for {year}: {e}")
97
  raise gr.Error(f"An unexpected error occurred: {str(e)}")
98
 
99
+ def parse_cve_items(nvd_data: dict) -> pd.DataFrame:
 
100
  """
101
+ Extracts vulnerability details from the raw NVD JSON data into a structured DataFrame.
102
  """
103
  rows = []
104
+ for item in nvd_data.get("CVE_Items", []):
105
+ try:
106
+ cve_id = item.get("cve", {}).get("CVE_data_meta", {}).get("ID", "N/A")
107
+ desc_data = item.get("cve", {}).get("description", {}).get("description_data", [])
108
+ description = desc_data[0].get("value", "No description") if desc_data else "No description"
109
+ published = item.get("publishedDate", "")
110
+ base_score, severity, attack_vector = None, "N/A", "N/A"
111
+
112
+ if "baseMetricV3" in item.get("impact", {}):
113
+ impact_v3 = item["impact"]["baseMetricV3"]["cvssV3"]
114
+ base_score = impact_v3.get("baseScore")
115
+ severity = impact_v3.get("baseSeverity")
116
+ attack_vector = impact_v3.get("attackVector")
117
+ elif "baseMetricV2" in item.get("impact", {}):
118
+ impact_v2 = item["impact"]["baseMetricV2"]
119
+ base_score = impact_v2["cvssV2"].get("baseScore")
120
+ severity = impact_v2.get("severity")
121
+ attack_vector = impact_v2.get("accessVector")
122
+
123
+ problem_types = item.get("cve", {}).get("problemtype", {}).get("problemtype_data", [])
124
+ cwe_ids = [desc["value"] for pt in problem_types for desc in pt.get("description", []) if desc.get("value", "").startswith("CWE-")]
125
+
126
+ rows.append({
127
+ "CVE_ID": cve_id, "Description": description, "Published": published[:10],
128
+ "Base_Score": base_score, "Severity": severity, "Attack_Vector": attack_vector,
129
+ "CWE_IDs": ", ".join(cwe_ids) if cwe_ids else "N/A"
130
+ })
131
+ except Exception as e:
132
+ cve_id_str = cve_id if 'cve_id' in locals() else "Unknown"
133
+ logger.warning(f"Skipping malformed CVE item ({cve_id_str}): {e}")
134
  continue
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  df = pd.DataFrame(rows)
137
+ if "Base_Score" in df.columns:
138
+ df["Base_Score"] = pd.to_numeric(df["Base_Score"], errors='coerce')
 
139
  return df
140
 
141
 
 
146
  Generates a tailored CVE summary using the Hugging Face Inference API.
147
  """
148
  if not hf_token:
149
+ raise gr.Error("Hugging Face API token is not configured as a Space Secret.")
150
+ if not cve_description or not audience:
151
+ return "Please select a CVE and an audience first."
 
 
152
 
153
  api_url = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
154
  headers = {"Authorization": f"Bearer {hf_token}"}
155
+ profile = AUDIENCE_PROFILES.get(audience, {})
156
 
157
  prompt = f"""<s>[INST] You are an expert cybersecurity analyst. Your task is to rewrite the following technical CVE description into a concise, actionable summary for a specific professional audience.
158
 
159
+ **Target Audience:** {audience}
160
+ - **Focus:** {profile.get('focus', 'N/A')}
161
+ - **Key Priorities:** {', '.join(profile.get('priorities', []))}
162
+
163
+ **Original CVE Description:**
164
+ ---
165
+ {cve_description}
166
+ ---
167
+
168
+ Rewrite the description in a {profile.get('tone', 'professional')} tone, focusing on what matters most to this audience. Do not start with "As a [role]...". Directly provide the summary. [/INST]"""
169
+
170
+ payload = {"inputs": prompt, "parameters": {"max_new_tokens": 256, "return_full_text": False}}
 
 
 
 
 
 
 
 
171
 
172
  try:
173
+ response = requests.post(api_url, headers=headers, json=payload, timeout=45)
174
+ if response.status_code != 200:
175
+ error_message = response.json().get("error", "Unknown error")
176
+ logger.error(f"Inference API Error: {error_message}")
177
+ return f"Error from API: {error_message}. The model might be loading, please try again."
 
 
 
 
 
 
178
 
179
+ return response.json()[0]['generated_text'].strip()
180
+
181
+ except requests.exceptions.RequestException as e:
182
+ logger.error(f"Request to Inference API failed: {e}")
183
+ return f"Error: Could not connect to the Hugging Face API. {e}"
 
 
 
 
 
 
 
 
 
 
184
 
185
 
186
  # --- Analysis and Visualization ---
187
 
188
+ def analyze_and_visualize(df: pd.DataFrame, severity: str, vector: str, search: str):
 
 
 
 
 
189
  """
190
+ Filters the main DataFrame and generates all outputs: a filtered table,
191
+ visualizations, and a summary markdown string.
192
  """
193
  if df is None or df.empty:
194
+ return pd.DataFrame(), None, None, "### No Data Loaded"
195
+
196
+ filtered_df = df.copy()
197
+ if severity != "All":
198
+ filtered_df = filtered_df[filtered_df["Severity"] == severity]
199
+ if vector != "All":
200
+ filtered_df = filtered_df[filtered_df["Attack_Vector"] == vector]
201
+ if search:
202
+ mask = (filtered_df["CVE_ID"].str.contains(search, case=False, na=False) |
203
+ filtered_df["Description"].str.contains(search, case=False, na=False) |
204
+ filtered_df["CWE_IDs"].str.contains(search, case=False, na=False))
205
+ filtered_df = filtered_df[mask]
206
+
207
+ return filtered_df, create_severity_chart(filtered_df), create_timeline_chart(filtered_df), create_summary_text(filtered_df)
208
+
209
+ def create_severity_chart(df: pd.DataFrame):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  """Creates a bar chart for CVE severity distribution."""
211
+ if df.empty: return None
212
+ order = ["CRITICAL", "HIGH", "MEDIUM", "LOW", "N/A"]
213
+ counts = df["Severity"].value_counts().reindex(order, fill_value=0)
214
+ color_map = {"CRITICAL": "#8B0000", "HIGH": "#FF4500", "MEDIUM": "#FFA500", "LOW": "#FFD700", "N/A": "#D3D3D3"}
215
+
216
+ fig = px.bar(counts, x=counts.index, y=counts.values, labels={"x": "Severity", "y": "Count"},
217
+ title="CVE Severity Distribution", color=counts.index, color_discrete_map=color_map, text_auto=True)
218
+ fig.update_layout(showlegend=False, xaxis={'categoryorder':'array', 'categoryarray':order})
219
+ return fig
 
 
 
 
 
 
 
 
 
220
 
221
def create_timeline_chart(df: pd.DataFrame):
    """Creates a line chart showing CVE publications over time."""
    if df.empty or "Published" not in df.columns:
        return None

    timeline = df.copy()
    # Unparseable dates become NaT and are dropped rather than raising.
    timeline["Date"] = pd.to_datetime(timeline["Published"], errors="coerce")
    timeline = timeline.dropna(subset=["Date"])
    if timeline.empty:
        return None

    # Aggregate publication counts per calendar month.
    monthly_counts = timeline.set_index("Date").resample("M").size()
    return px.line(
        x=monthly_counts.index,
        y=monthly_counts.values,
        labels={"x": "Month", "y": "Number of CVEs"},
        title="CVE Publications Timeline",
        markers=True,
    )
233
 
234
def create_summary_text(df: pd.DataFrame) -> str:
    """Generates a markdown string with key statistics from the DataFrame."""
    if df.empty:
        return "### No results match your filter criteria."

    # Mean is computed over non-null scores only; show N/A when none exist.
    valid_scores = df["Base_Score"].dropna()
    mean_display = "N/A" if valid_scores.empty else f"{valid_scores.mean():.2f}"

    severity_counts = df["Severity"].value_counts()
    critical_total = int(severity_counts.get("CRITICAL", 0))
    high_total = int(severity_counts.get("HIGH", 0))

    report_lines = [
        "### Summary Statistics",
        f"- **Total CVEs Found:** {len(df):,}",
        f"- **Critical:** {critical_total:,}",
        f"- **High:** {high_total:,}",
        f"- **Average Base Score:** {mean_display}",
    ]
    return "\n".join(report_lines)
244
+
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
  # --- Gradio UI and Event Logic ---
247
 
248
def create_dashboard():
    """Builds the entire Gradio interface and defines event handling.

    Returns:
        gr.Blocks: the fully wired dashboard, ready for `.launch()`.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="CVE Dashboard") as dashboard:
        # Session-scoped state: the full (unfiltered) DataFrame for the
        # selected year, the description of the currently selected CVE row,
        # and the HF token picked up once from the environment.
        df_state = gr.State()
        selected_cve_description = gr.State("")
        hf_token_state = gr.State(os.environ.get("HF_TOKEN"))

        gr.Markdown("# CVE Dashboard: NVD Feed Analyzer")
        gr.Markdown("Explore CVE data from the National Vulnerability Database. **Note:** This demo uses deprecated NVD JSON feeds; a production app should use the NVD API 2.0.")

        with gr.Row():
            # Left column: filter controls.
            with gr.Column(scale=1):
                year_dd = gr.Dropdown(choices=list(range(2002, CURRENT_YEAR + 1))[::-1], value=CURRENT_YEAR, label="1. Select Year")
                severity_dd = gr.Dropdown(choices=["All", "CRITICAL", "HIGH", "MEDIUM", "LOW"], value="All", label="2. Filter by Severity")
                vector_dd = gr.Dropdown(choices=["All", "NETWORK", "ADJACENT_NETWORK", "LOCAL", "PHYSICAL"], value="All", label="3. Filter by Attack Vector")
                search_tb = gr.Textbox(label="4. Search Keyword", placeholder="e.g., 'Log4j', 'CWE-79', ...")
                filter_btn = gr.Button("Apply Filters", variant="primary")

            # Right column: summary text plus tabbed table/chart views.
            with gr.Column(scale=3):
                summary_out = gr.Markdown()
                with gr.Tabs():
                    with gr.TabItem("📊 Data Table"):
                        # NOTE(review): `max_rows` was removed in Gradio 4.x
                        # in favor of pagination settings — confirm the pinned
                        # gradio version accepts it.
                        table_out = gr.DataFrame(headers=["CVE_ID", "Severity", "Base_Score", "Description"], wrap=True, max_rows=15, interactive=True)
                    with gr.TabItem("📈 Severity Chart"):
                        plot_severity_out = gr.Plot()
                    with gr.TabItem("📉 Timeline Chart"):
                        plot_timeline_out = gr.Plot()

        # Collapsed by default; opened programmatically when a row is selected.
        with gr.Accordion("Tailored CVE Analysis (Select a row in the table above)", open=False) as llm_accordion:
            with gr.Row():
                with gr.Column(scale=2):
                    original_desc_out = gr.Textbox(label="Full Original CVE Description", lines=8, interactive=False)
                with gr.Column(scale=1):
                    audience_dd = gr.Dropdown(choices=list(AUDIENCE_PROFILES.keys()), label="Select Audience", value="Cybersecurity Professional")
                    generate_btn = gr.Button("Generate Tailored Summary", variant="primary")
                    summary_llm_out = gr.Markdown("*Your tailored summary will appear here...*")

        # --- Event Handling Logic ---

        def on_year_change(year):
            # Reload the year's data and re-run the analysis with all
            # filters reset to their defaults.
            df = get_cve_dataframe(year)
            return df, *analyze_and_visualize(df, "All", "All", "")

        def on_select_cve(df: pd.DataFrame, evt: gr.SelectData):
            # Triggered by a table row click; evt.index[0] is the row index
            # in the currently displayed frame. Returns the description for
            # both the state and the textbox, and opens the LLM accordion.
            if evt.value is None: return "", "", gr.update(visible=False)
            full_description = df.iloc[evt.index[0]]["Description"]
            return full_description, full_description, gr.update(visible=True)

        filter_inputs = [df_state, severity_dd, vector_dd, search_tb]
        analysis_outputs = [table_out, plot_severity_out, plot_timeline_out, summary_out]

        # Initial population on page load, and full reload on year change.
        year_dd.change(fn=on_year_change, inputs=[year_dd], outputs=[df_state] + analysis_outputs)
        dashboard.load(fn=on_year_change, inputs=[year_dd], outputs=[df_state] + analysis_outputs)

        # Wire every filter control to the same analysis callback, picking the
        # event type appropriate for each component (click/submit/change).
        for control in [severity_dd, vector_dd, filter_btn, search_tb]:
            event = control.click if isinstance(control, gr.Button) else (control.submit if isinstance(control, gr.Textbox) else control.change)
            event(fn=analyze_and_visualize, inputs=filter_inputs, outputs=analysis_outputs)

        table_out.select(fn=on_select_cve, inputs=[df_state], outputs=[selected_cve_description, original_desc_out, llm_accordion], show_progress="hidden")
        generate_btn.click(fn=generate_tailored_summary, inputs=[selected_cve_description, audience_dd, hf_token_state], outputs=[summary_llm_out])

    return dashboard
309
 
310
if __name__ == "__main__":
    # Build the UI and start the Gradio server with default settings.
    create_dashboard().launch()
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,5 +1,4 @@
1
  gradio
2
  pandas
3
  plotly
4
- requests
5
- urllib3
 
1
  gradio
2
  pandas
3
  plotly
4
+ requests