|
| 1 | +import contextlib # Added for SIM105 fix |
| 2 | +import csv |
| 3 | +import json |
| 4 | +import tempfile |
| 5 | + |
| 6 | +import pandas as pd |
| 7 | +import streamlit as st |
| 8 | + |
| 9 | +from agno.models.openai import OpenAIChat # type: ignore |
| 10 | +from agno.tools.pandas import PandasTools |
| 11 | +from phi.agent.duckdb import DuckDbAgent |
| 12 | + |
| 13 | + |
# Function to preprocess and save the uploaded file
def preprocess_and_save(file):  # type: ignore
    """Read an uploaded CSV/XLSX file, clean it, and persist it as a temp CSV.

    Parameters
    ----------
    file : uploaded-file object
        Must expose a ``.name`` attribute (used to sniff the extension) and be
        readable by pandas (e.g. a Streamlit ``UploadedFile`` or ``BytesIO``).

    Returns
    -------
    tuple
        ``(temp_csv_path, column_names, dataframe)`` on success, or
        ``(None, None, None)`` on any failure (the error is surfaced to the
        user via ``st.error``).
    """
    try:
        # Read the uploaded file into a DataFrame
        if file.name.endswith(".csv"):
            df = pd.read_csv(file, encoding="utf-8", na_values=["NA", "N/A", "missing"])
        elif file.name.endswith(".xlsx"):
            df = pd.read_excel(file, na_values=["NA", "N/A", "missing"])
        else:
            st.error("Unsupported file format. Please upload a CSV or Excel file.")
            return None, None, None

        # NOTE: no manual quote-escaping here. The previous version doubled
        # embedded quotes itself, but ``DataFrame.to_csv`` with
        # ``quoting=csv.QUOTE_ALL`` already escapes quotes per RFC 4180, so
        # pre-escaping produced doubly-escaped output ('""""' instead of '""').
        # It also ran ``astype(str)`` on object columns, turning NaN into the
        # literal string "nan" and defeating the ``na_values`` handling above.

        # Parse dates and coerce numeric-looking object columns.
        for col in df.columns:
            if "date" in col.lower():
                # Unparseable entries become NaT instead of raising.
                df[col] = pd.to_datetime(df[col], errors="coerce")
            elif df[col].dtype == "object":
                # Best effort: leave the column untouched if any value
                # is non-numeric.
                with contextlib.suppress(ValueError, TypeError):
                    df[col] = pd.to_numeric(df[col])

        # Reserve a named temp file, then close it BEFORE writing: a
        # NamedTemporaryFile cannot be re-opened by path while still open
        # on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
            temp_path = temp_file.name
        # Quote every field so string values survive the round-trip into DuckDB.
        df.to_csv(temp_path, index=False, quoting=csv.QUOTE_ALL)

        return temp_path, df.columns.tolist(), df  # Return the DataFrame as well
    except Exception as e:
        # Broad catch is deliberate: this is a UI boundary, so report the
        # failure to the user instead of crashing the Streamlit script.
        st.error(f"Error processing file: {e}")
        return None, None, None
| 48 | + |
| 49 | + |
# --- Streamlit page setup ---
st.title("📊 Data Analyst Agent")

# Sidebar: collect the OpenAI key and stash it in session state so the main
# pane (which checks "openai_key" in st.session_state) can unlock itself.
with st.sidebar:
    st.header("API Keys")
    if openai_key := st.text_input("Enter your OpenAI API key:", type="password"):
        st.session_state.openai_key = openai_key
        st.success("API key saved!")
    else:
        st.warning("Please enter your OpenAI API key to proceed.")

# File upload widget
uploaded_file = st.file_uploader("Upload a CSV or Excel file", type=["csv", "xlsx"])
| 65 | + |
# Gate the analysis UI on having both an uploaded file and a saved API key.
if uploaded_file is not None and "openai_key" in st.session_state:
    # Clean the upload and persist it to a temp CSV that DuckDB can read.
    temp_path, columns, df = preprocess_and_save(uploaded_file)  # type: ignore

    if temp_path and columns and df is not None:
        # Show the cleaned data and its columns so the user can phrase queries.
        st.write("Uploaded Data:")
        st.dataframe(df)  # Use st.dataframe for an interactive table

        # Display the columns of the uploaded data
        st.write("Uploaded columns:", columns)

        # Semantic model: tells the agent the table name and where the CSV
        # lives on disk (the temp file written above).
        semantic_model = {
            "tables": [
                {
                    "name": "uploaded_data",
                    "description": "Contains the uploaded dataset.",
                    "path": temp_path,
                }
            ]
        }

        # SQL-generating agent backed by DuckDB over the temp CSV.
        duckdb_agent = DuckDbAgent(
            model=OpenAIChat(model="gpt-4", api_key=st.session_state.openai_key),  # type: ignore
            semantic_model=json.dumps(semantic_model),
            tools=[PandasTools()],  # type: ignore
            markdown=True,
            add_history_to_messages=False,  # stateless: no chat history
            followups=False,  # disable follow-up queries
            read_tool_call_history=False,  # disable reading tool call history
            system_prompt=(
                "You are an expert data analyst. Generate SQL queries to solve the user's query. "
                "Return only the SQL query, enclosed in ```sql ``` and give the final answer."
            ),
        )

        # Initialize code storage in session state (persists across reruns).
        if "generated_code" not in st.session_state:
            st.session_state.generated_code = None

        # Main query input widget
        user_query = st.text_area("Ask a query about the data:")

        # print_response() below streams to stdout, hence this hint.
        st.info("💡 Check your terminal for a clearer output of the agent's response")

        if st.button("Submit Query"):
            if user_query.strip() == "":
                st.warning("Please enter a query.")
            else:
                try:
                    # Show loading spinner while processing
                    with st.spinner("Processing your query..."):
                        # First pass: capture the full answer for the web UI.
                        response1 = duckdb_agent.run(user_query)

                        # Some agent versions return a response object with
                        # .content, others return a plain string.
                        response_content = response1.content if hasattr(response1, "content") else str(response1)

                        # Second pass: stream the same query to the terminal.
                        # NOTE(review): this re-runs the model (extra cost and
                        # latency) purely for terminal output, and the return
                        # value is unused — confirm the duplication is intended.
                        response = duckdb_agent.print_response(  # type: ignore
                            user_query,
                            stream=True,
                        )

                        # Display the response in Streamlit
                        st.markdown(response_content)

                except Exception as e:
                    # UI boundary: report rather than crash the script.
                    st.error(f"Error generating response from the DuckDbAgent: {e}")
                    st.error("Please try rephrasing your query or check if the data format is correct.")
0 commit comments