|
| 1 | +import contextlib # Added for SIM105 fix |
| 2 | +import csv |
| 3 | +import json |
| 4 | +import tempfile |
| 5 | + |
| 6 | +import pandas as pd |
| 7 | +import streamlit as st |
| 8 | + |
| 9 | +from agno.models.openai import OpenAIChat # type: ignore |
| 10 | +from agno.tools.pandas import PandasTools |
| 11 | +from phi.agent.duckdb import DuckDbAgent |
| 12 | + |
| 13 | + |
# Function to preprocess and save the uploaded file
def preprocess_and_save(file):  # type: ignore
    """Read an uploaded CSV/XLSX file, clean it, and persist it as a temp CSV.

    Parameters
    ----------
    file : uploaded-file object
        Must expose a ``.name`` attribute (used to sniff the extension) and be
        readable by pandas (e.g. a Streamlit ``UploadedFile`` or ``BytesIO``).

    Returns
    -------
    tuple
        ``(temp_csv_path, column_names, dataframe)`` on success, or
        ``(None, None, None)`` on any failure (the error is surfaced to the
        user via ``st.error``).
    """
    try:
        # Read the uploaded file into a DataFrame
        if file.name.endswith(".csv"):
            df = pd.read_csv(file, encoding="utf-8", na_values=["NA", "N/A", "missing"])
        elif file.name.endswith(".xlsx"):
            df = pd.read_excel(file, na_values=["NA", "N/A", "missing"])
        else:
            st.error("Unsupported file format. Please upload a CSV or Excel file.")
            return None, None, None

        # NOTE: no manual quote-escaping here. The previous version doubled
        # embedded quotes itself, but ``DataFrame.to_csv`` with
        # ``quoting=csv.QUOTE_ALL`` already escapes quotes per RFC 4180, so
        # pre-escaping produced doubly-escaped output ('""""' instead of '""').
        # It also ran ``astype(str)`` on object columns, turning NaN into the
        # literal string "nan" and defeating the ``na_values`` handling above.

        # Parse dates and coerce numeric-looking object columns.
        for col in df.columns:
            if "date" in col.lower():
                # Unparseable entries become NaT instead of raising.
                df[col] = pd.to_datetime(df[col], errors="coerce")
            elif df[col].dtype == "object":
                # Best effort: leave the column untouched if any value
                # is non-numeric.
                with contextlib.suppress(ValueError, TypeError):
                    df[col] = pd.to_numeric(df[col])

        # Reserve a named temp file, then close it BEFORE writing: a
        # NamedTemporaryFile cannot be re-opened by path while still open
        # on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
            temp_path = temp_file.name
        # Quote every field so string values survive the round-trip into DuckDB.
        df.to_csv(temp_path, index=False, quoting=csv.QUOTE_ALL)

        return temp_path, df.columns.tolist(), df  # Return the DataFrame as well
    except Exception as e:
        # Broad catch is deliberate: this is a UI boundary, so report the
        # failure to the user instead of crashing the Streamlit script.
        st.error(f"Error processing file: {e}")
        return None, None, None
| 48 | + |
| 49 | + |
# --- Streamlit page setup ---
st.title("📊 Data Analyst Agent")

# Sidebar: collect the OpenAI key and stash it in session state so the main
# pane (which checks "openai_key" in st.session_state) can unlock itself.
with st.sidebar:
    st.header("API Keys")
    if openai_key := st.text_input("Enter your OpenAI API key:", type="password"):
        st.session_state.openai_key = openai_key
        st.success("API key saved!")
    else:
        st.warning("Please enter your OpenAI API key to proceed.")

# File upload widget
uploaded_file = st.file_uploader("Upload a CSV or Excel file", type=["csv", "xlsx"])
| 65 | + |
# Gate the analysis UI on having both an uploaded file and a saved API key.
if uploaded_file is not None and "openai_key" in st.session_state:
    # Clean the upload and persist it to a temp CSV that DuckDB can read.
    temp_path, columns, df = preprocess_and_save(uploaded_file)  # type: ignore

    if temp_path and columns and df is not None:
        # Show the cleaned data and its columns so the user can phrase queries.
        st.write("Uploaded Data:")
        st.dataframe(df)  # Use st.dataframe for an interactive table

        # Display the columns of the uploaded data
        st.write("Uploaded columns:", columns)

        # Semantic model: tells the agent the table name and where the CSV
        # lives on disk (the temp file written above).
        semantic_model = {
            "tables": [
                {
                    "name": "uploaded_data",
                    "description": "Contains the uploaded dataset.",
                    "path": temp_path,
                }
            ]
        }

        # SQL-generating agent backed by DuckDB over the temp CSV.
        duckdb_agent = DuckDbAgent(
            model=OpenAIChat(model="gpt-4", api_key=st.session_state.openai_key),  # type: ignore
            semantic_model=json.dumps(semantic_model),
            tools=[PandasTools()],  # type: ignore
            markdown=True,
            add_history_to_messages=False,  # stateless: no chat history
            followups=False,  # disable follow-up queries
            read_tool_call_history=False,  # disable reading tool call history
            system_prompt=(
                "You are an expert data analyst. Generate SQL queries to solve the user's query. "
                "Return only the SQL query, enclosed in ```sql ``` and give the final answer."
            ),
        )

        # Initialize code storage in session state (persists across reruns).
        if "generated_code" not in st.session_state:
            st.session_state.generated_code = None

        # Main query input widget
        user_query = st.text_area("Ask a query about the data:")

        # print_response() below streams to stdout, hence this hint.
        st.info("💡 Check your terminal for a clearer output of the agent's response")

        if st.button("Submit Query"):
            if user_query.strip() == "":
                st.warning("Please enter a query.")
            else:
                try:
                    # Show loading spinner while processing
                    with st.spinner("Processing your query..."):
                        # First pass: capture the full answer for the web UI.
                        response1 = duckdb_agent.run(user_query)

                        # Some agent versions return a response object with
                        # .content, others return a plain string.
                        response_content = response1.content if hasattr(response1, "content") else str(response1)

                        # Second pass: stream the same query to the terminal.
                        # NOTE(review): this re-runs the model (extra cost and
                        # latency) purely for terminal output, and the return
                        # value is unused — confirm the duplication is intended.
                        response = duckdb_agent.print_response(  # type: ignore
                            user_query,
                            stream=True,
                        )

                        # Display the response in Streamlit
                        st.markdown(response_content)

                except Exception as e:
                    # UI boundary: report rather than crash the script.
                    st.error(f"Error generating response from the DuckDbAgent: {e}")
                    st.error("Please try rephrasing your query or check if the data format is correct.")
0 commit comments