diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/README.md b/README.md index b29e89d..03e5635 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,20 @@ # pandas-LLM ## Introduction -pandas-llm is a lightweight Python library that extends pandas to allow querying datasets using OpenAI prompts. This powerful tool leverages the natural language processing capabilities of OpenAI to offer intuitive, language-based querying of your Pandas dataframes. +pandas-llm is a lightweight Python library that extends pandas to allow querying datasets using OpenAI prompts. This powerful tool leverages the natural language processing capabilities of OpenAI to offer intuitive, language-based querying of your Pandas dataframes with built-in validation and safety features. ## Key Features -- **Natural Language Querying**: With pandas-llm, you can execute complex Pandas queries using natural language prompts. Instead of writing code, you can express your query in plain language and obtain the desired results. +- **Natural Language Querying**: Execute complex Pandas queries using natural language prompts. Instead of writing code, express your query in plain language and obtain the desired results. -- **Data Privacy**: Your data is not sent on the Internet. Pandas-LLM works locally with your data and uses openAI to create the query based on the dataframe columns and data types, not its content. +- **Data Privacy**: Your data stays local. Pandas-LLM works with your data locally and uses OpenAI to create queries based on dataframe metadata (columns and data types), not its content. -- **Seamless Integration**: The library seamlessly integrates with your existing Pandas workflow. 
You can continue using normal Pandas functions and syntax while leveraging the added capability of natural language queries. +- **Query Validation**: Built-in validation ensures generated queries are safe and compatible with your data types, preventing common errors and ensuring reliable results. -- **Efficiency and Performance**: pandas-LLM is designed to deliver efficient and performant querying capabilities. It uses OpenAI's language model to process queries quickly and accurately, providing rapid insights from your data. +- **Safe Execution**: Uses RestrictedPython for sandboxed execution of generated queries, providing an additional layer of security. -- **Flexible and Expressive**: Whether you need to filter, aggregate, sort, or transform your data, pandas-LLM allows you to express your requirements flexibly and expressively. You can perform complex operations on your dataframes with ease using human-readable language. +- **Serializable Results**: Results are automatically converted to JSON-serializable formats, making it easy to store or transmit query results. -- **Intelligent Results**: The library returns the results of your queries in a concise and understandable format. You can extract valuable insights from your data without complex code or manual analysis. - -With pandas-llm, you can unlock the power of natural language querying and effortlessly execute complex pandas queries. Let the library handle the intricacies of data manipulation while you focus on gaining insights and making data-driven decisions. +- **Type-Safe Operations**: Intelligent handling of different data types including strings, numbers, dates, and boolean values with appropriate null value handling. ## Installation @@ -26,77 +24,90 @@ Install pandas-llm using pip: pip install pandas-llm ``` -## Features -- Query pandas dataframes using natural language prompts. -- Leverage the power of OpenAI's language models in your data analysis. 
-Seamless integration with existing pandas functions. - ## Usage -Here's a quick [example](https://github.com/DashyDashOrg/pandas-llm/blob/main/pandas_llm/example.py) of how to use pandas-llm: +Here's a basic example of how to use pandas-llm: ```python -import os +import json import pandas as pd -from pandas_llm import PandasLLM - -# Data -# Please note that these names, ages, and donations are randomly generated -# and do not correspond to real individuals or their donations. -data = [('John Doe', 25, 50), - ('Jane Smith', 38, 70), - ('Alex Johnson', 45, 80), - ('Jessica Brown', 60, 40), - ('Michael Davis', 22, 90), - ('Emily Wilson', 30, 60), - ('Daniel Taylor', 35, 75), - ('Sophia Moore', 40, 85), - ('David Thomas', 50, 65), - ('Olivia Jackson', 29, 55)] -df = pd.DataFrame(data, columns=['name', 'age', 'donation']) - -conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY")) -result = conv_df.prompt("What is the average donation of people older than 40 who donated more than $50?") -code = conv_df.code_block - -print(f"Executing the following expression of type {type(result)}:\n{code}\n\nResult is:\n {result}\n") -# Executing the following expression of type : -# result = df.loc[(df['age'] > 40) & (df['donation'] > 50), 'donation'].mean() - -# Result is: -# 72.5 +from src.pandas_query import PandasQuery -``` +# Load the sample customer data from CSV +df = pd.read_csv("customers-100.csv") -There is also a chatbot available in the repository using the same dataset. -Look at [Chatbot example](https://github.com/DashyDashOrg/pandas-llm/blob/main/pandas_llm/example-chatbot.py) +# Create query executor +querier = PandasQuery(validate=True, temperature=0) -## PandasLLM Class Constructor +# Execute query +try: + # query = "Get a table of all customers who have a first name beginning with 'D' and who live in a city with exactly two e's in it?" 
+ # query = "Get a subtable of people who live in Panama" + query = "Get a subtable of people whos surname backwards is: 'nosdodn' or 'atam'" + result = querier.execute(df, query) -The constructor for the PandasLLM class has been enhanced in this release to provide more flexibility and control over the language model interaction. The constructor accepts the following arguments: + # Get complete results as a dictionary + result_dict = result.model_dump() + print("\nComplete results:") + print(json.dumps(result_dict, indent=2)) -**data** (mandatory): The data to be used. It can be a Pandas DataFrame, a list of lists, tuples, dictionaries, a dictionary, a string, or a list. + # df of results + print('\nHere is a table of the output results:\n') + df_result = pd.DataFrame(result.result) + print(df_result) -**llm_engine** (optional): The name of the LLM engine to use. Currently, only OpenAI is supported. Defaults to "openai". +except Exception as e: + print(f"Error executing query: {str(e)}") +``` -**llm_params** (optional): A dictionary of parameters to be used with the OpenAI API. This allows customization of the LLM behavior. Defaults to model=gpt-3.5-turbo and temperature=0.2. +## Query Result Structure +The library returns a QueryResult object with the following attributes: -**prompt_override** (optional): A boolean that determines whether or not the prompt is overridden. If set to True, the custom prompt becomes the main prompt. Defaults to False. +```python +{ + "query": str, # Original natural language query + "code": str, # Generated pandas code + "is_valid": bool, # Whether the query passed validation + "errors": List[str], # Any validation or execution errors + "result": Any # Query results (automatically serialized) +} +``` -**custom_prompt** (optional): A string that can be provided if prompt_override is False. The custom prompt will be added to the default pandas_llm prompt. Defaults to an empty string. 
+## Supported Operations +The library supports a wide range of pandas operations: -**path** (optional): The path to the file where the debug data will be saved. If not specified, debug data files will not be generated. +### String Operations +- Basic: contains, startswith, endswith, lower, upper, strip +- Count/Match: count, match, extract, find, findall +- Transform: replace, pad, center, slice, split -**verbose** (optional): A boolean determines whether debugging information will be printed. If set to True, additional debugging info will be displayed. Defaults to False. +### Numeric Operations +- Comparisons: >, <, >=, <=, ==, != +- Aggregations (with groupby): sum, mean, median, min, max, count -**data_privacy** (optional): A boolean determines whether the data is treated as private. If set to True, the function will not send the data content to OpenAI. Defaults to True. +### Date Operations +- Attributes: year, month, day +- Comparisons: >, <, >=, <=, ==, != -**llm_api_key** (optional): The OpenAI API key to be used. The library will attempt to use the default API key configured if not provided. +### Advanced Features +- Automatic null handling appropriate to data type +- Type-safe operations with proper conversions +- Multi-condition filtering with proper parentheses +- Case-sensitive and case-insensitive string operations -**force_sandbox** (optional): A boolean determining the fallback behaviour if the sandbox environment fails. If set to False and the sandbox fails, the library will retry using eval, which is less safe. Defaults to False. +## Configuration +The PandasQuery constructor accepts the following parameters: +```python +PandasQuery( + model: str = "gpt-4", # OpenAI model to use + temperature: float = 0.2, # Temperature for query generation + api_key: Optional[str] = None, # OpenAI API key + validate: bool = True # Enable/disable query validation +) +``` ## Contributing Pull requests are welcome. 
For major changes, please open an issue first to discuss what you would like to change. Please make sure to update tests as appropriate. ## License -MIT +MIT \ No newline at end of file diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000..03457e4 --- /dev/null +++ b/examples/__init__.py @@ -0,0 +1,32 @@ +# examples/__init__.py +import os +import pandas as pd +import sys +from pathlib import Path + +# Add the project root to the Python path +project_root = Path(__file__).resolve().parent.parent +sys.path.append(str(project_root)) + +from src.pandas_query import PandasQuery + +# Data +data = [ + ('John Doe', 25, 50), + ('Jane Smith', 38, 70), + ('Alex Johnson', 45, 80), + ('Jessica Brown', 60, 40), + ('Michael Davis', 22, 90), +] +df = pd.DataFrame(data, columns=['name', 'age', 'donation']) + +# Create query executor +querier = PandasQuery() + +# Execute query +query = "What is the average donation of people older than 40 who donated more than $50?" +result = querier.execute(df, query) + +print(f"Query: {query}") +print(f"Generated code: {querier.last_code}") +print(f"Result: {result}") \ No newline at end of file diff --git a/examples/customers-100.csv b/examples/customers-100.csv new file mode 100644 index 0000000..635248c --- /dev/null +++ b/examples/customers-100.csv @@ -0,0 +1,101 @@ +Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website +1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/ +2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/ +3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/ +4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and 
Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/ +5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/ +6,2d08FB17EE273F4,Aimee,Downs,Steele Group,Chavezborough,Bosnia and Herzegovina,(283)437-3886x88321,999-728-1637,louis27@gilbert.com,2020-02-25,http://www.berger.net/ +7,EA4d384DfDbBf77,Darren,Peck,"Lester, Woodard and Mitchell",Lake Ana,Pitcairn Islands,(496)452-6181x3291,+1-247-266-0963x4995,tgates@cantrell.com,2021-08-24,https://www.le.com/ +8,0e04AFde9f225dE,Brett,Mullen,"Sanford, Davenport and Giles",Kimport,Bulgaria,001-583-352-7197x297,001-333-145-0369,asnow@colon.com,2021-04-12,https://hammond-ramsey.com/ +9,C2dE4dEEc489ae0,Sheryl,Meyers,Browning-Simon,Robersonstad,Cyprus,854-138-4911x5772,+1-448-910-2276x729,mariokhan@ryan-pope.org,2020-01-13,https://www.bullock.net/ +10,8C2811a503C7c5a,Michelle,Gallagher,Beck-Hendrix,Elaineberg,Timor-Leste,739.218.2516x459,001-054-401-0347x617,mdyer@escobar.net,2021-11-08,https://arias.com/ +11,216E205d6eBb815,Carl,Schroeder,"Oconnell, Meza and Everett",Shannonville,Guernsey,637-854-0256x825,114.336.0784x788,kirksalas@webb.com,2021-10-20,https://simmons-hurley.com/ +12,CEDec94deE6d69B,Jenna,Dodson,"Hoffman, Reed and Mcclain",East Andrea,Vietnam,(041)737-3846,+1-556-888-3485x42608,mark42@robbins.com,2020-11-29,http://www.douglas.net/ +13,e35426EbDEceaFF,Tracey,Mata,Graham-Francis,South Joannamouth,Togo,001-949-844-8787,(855)713-8773,alex56@walls.org,2021-12-02,http://www.beck.com/ +14,A08A8aF8BE9FaD4,Kristine,Cox,Carpenter-Cook,Jodyberg,Sri Lanka,786-284-3358x62152,+1-315-627-1796x8074,holdenmiranda@clarke.com,2021-02-08,https://www.brandt.com/ 
+15,6fEaA1b7cab7B6C,Faith,Lutz,Carter-Hancock,Burchbury,Singapore,(781)861-7180x8306,207-185-3665,cassieparrish@blevins-chapman.net,2022-01-26,http://stevenson.org/ +16,8cad0b4CBceaeec,Miranda,Beasley,Singleton and Sons,Desireeshire,Oman,540.085.3135x185,+1-600-462-6432x21881,vduncan@parks-hardy.com,2022-04-12,http://acosta.org/ +17,a5DC21AE3a21eaA,Caroline,Foley,Winters-Mendoza,West Adriennestad,Western Sahara,936.222.4746x9924,001-469-948-6341x359,holtgwendolyn@watson-davenport.com,2021-03-10,http://www.benson-roth.com/ +18,F8Aa9d6DfcBeeF8,Greg,Mata,Valentine LLC,Lake Leslie,Mozambique,(701)087-2415,(195)156-1861x26241,jaredjuarez@carroll.org,2022-03-26,http://pitts-cherry.com/ +19,F160f5Db3EfE973,Clifford,Jacobson,Simon LLC,Harmonview,South Georgia and the South Sandwich Islands,001-151-330-3524x0469,(748)477-7174,joseph26@jacobson.com,2020-09-24,https://mcconnell.com/ +20,0F60FF3DdCd7aB0,Joanna,Kirk,Mays-Mccormick,Jamesshire,French Polynesia,(266)131-7001x711,(283)312-5579x11543,tuckerangie@salazar.net,2021-09-24,https://www.camacho.net/ +21,9F9AdB7B8A6f7F2,Maxwell,Frye,Patterson Inc,East Carly,Malta,423.262.3059,202-880-0688x7491,fgibson@drake-webb.com,2022-01-12,http://www.roberts.com/ +22,FBd0Ded4F02a742,Kiara,Houston,"Manning, Hester and Arroyo",South Alvin,Netherlands,001-274-040-3582x10611,+1-528-175-0973x4684,blanchardbob@wallace-shannon.com,2020-09-15,https://www.reid-potts.com/ +23,2FB0FAA1d429421,Colleen,Howard,Greer and Sons,Brittanyview,Paraguay,1935085151,(947)115-7711x5488,rsingleton@ryan-cherry.com,2020-08-19,http://paul.biz/ +24,010468dAA11382c,Janet,Valenzuela,Watts-Donaldson,Veronicamouth,Lao People's Democratic Republic,354.259.5062x7538,500.433.2022,stefanie71@spence.com,2020-09-08,https://moreno.biz/ +25,eC1927Ca84E033e,Shane,Wilcox,Tucker LLC,Bryanville,Albania,(429)005-9030x11004,541-116-4501,mariah88@santos.com,2021-04-06,https://www.ramos.com/ +26,09D7D7C8Fe09aea,Marcus,Moody,Giles 
Ltd,Kaitlyntown,Panama,674-677-8623,909-277-5485x566,donnamullins@norris-barrett.org,2022-05-24,https://www.curry.com/ +27,aBdfcF2c50b0bfD,Dakota,Poole,Simmons Group,Michealshire,Belarus,(371)987-8576x4720,071-152-1376,stacey67@fields.org,2022-02-20,https://sanford-wilcox.biz/ +28,b92EBfdF8a3f0E6,Frederick,Harper,"Hinton, Chaney and Stokes",South Marissatown,Switzerland,+1-077-121-1558x0687,264.742.7149,jacobkhan@bright.biz,2022-05-26,https://callahan.org/ +29,3B5dAAFA41AFa22,Stefanie,Fitzpatrick,Santana-Duran,Acevedoville,Saint Vincent and the Grenadines,(752)776-3286,+1-472-021-4814x85074,wterrell@clark.com,2020-07-30,https://meyers.com/ +30,EDA69ca7a6e96a2,Kent,Bradshaw,Sawyer PLC,North Harold,Tanzania,+1-472-143-5037x884,126.922.6153,qjimenez@boyd.com,2020-04-26,http://maynard-ho.com/ +31,64DCcDFaB9DFd4e,Jack,Tate,"Acosta, Petersen and Morrow",West Samuel,Zimbabwe,965-108-4406x20714,046.906.1442x6784,gfigueroa@boone-zavala.com,2021-09-15,http://www.hawkins-ramsey.com/ +32,679c6c83DD872d6,Tom,Trujillo,Mcgee Group,Cunninghamborough,Denmark,416-338-3758,(775)890-7209,tapiagreg@beard.info,2022-01-13,http://www.daniels-klein.com/ +33,7Ce381e4Afa4ba9,Gabriel,Mejia,Adkins-Salinas,Port Annatown,Liechtenstein,4077245425,646.044.0696x66800,coleolson@jennings.net,2021-04-24,https://patel-hanson.info/ +34,A09AEc6E3bF70eE,Kaitlyn,Santana,Herrera Group,New Kaitlyn,United States of America,6303643286,447-710-6202x07313,georgeross@miles.org,2021-09-21,http://pham.com/ +35,aA9BAFfBc3710fe,Faith,Moon,"Waters, Chase and Aguilar",West Marthaburgh,Bahamas,+1-586-217-0359x6317,+1-818-199-1403,willistonya@randolph-baker.com,2021-11-03,https://spencer-charles.info/ +36,E11dfb2DB8C9f72,Tammie,Haley,"Palmer, Barnes and Houston",East Teresa,Belize,001-276-734-4113x6087,(430)300-8770,harrisisaiah@jenkins.com,2022-01-04,http://evans-simon.com/ +37,889eCf90f68c5Da,Nicholas,Sosa,Jordan Ltd,South 
Hunter,Uruguay,(661)425-6042,975-998-1519,fwolfe@dorsey.com,2021-08-10,https://www.fleming-richards.com/ +38,7a1Ee69F4fF4B4D,Jordan,Gay,Glover and Sons,South Walter,Solomon Islands,7208417020,8035336772,tiffanydavies@harris-mcfarland.org,2021-02-24,http://www.lee.org/ +39,dca4f1D0A0fc5c9,Bruce,Esparza,Huerta-Mclean,Poolefurt,Montenegro,559-529-4424,001-625-000-7132x0367,preese@frye-vega.com,2021-10-22,http://www.farley.org/ +40,17aD8e2dB3df03D,Sherry,Garza,Anderson Ltd,West John,Poland,001-067-713-6440x158,(978)289-8785x5766,ann48@miller.com,2021-11-01,http://spence.com/ +41,2f79Cd309624Abb,Natalie,Gentry,Monroe PLC,West Darius,Dominican Republic,830.996.8238,499.122.5415,tcummings@fitzpatrick-ashley.com,2020-10-10,http://www.dorsey.biz/ +42,6e5ad5a5e2bB5Ca,Bryan,Dunn,Kaufman and Sons,North Jimstad,Burkina Faso,001-710-802-5565,078.699.8982x13881,woodwardandres@phelps.com,2021-09-08,http://www.butler.com/ +43,7E441b6B228DBcA,Wayne,Simpson,Perkins-Trevino,East Rebekahborough,Bolivia,(344)156-8632x1869,463-445-3702x38463,barbarapittman@holder.com,2020-12-13,https://gillespie-holder.com/ +44,D3fC11A9C235Dc6,Luis,Greer,Cross PLC,North Drew,Bulgaria,001-336-025-6849x701,684.698.2911x6092,bstuart@williamson-mcclure.com,2022-05-15,https://fletcher-nielsen.com/ +45,30Dfa48fe5Ede78,Rhonda,Frost,"Herrera, Shepherd and Underwood",Lake Lindaburgh,Monaco,(127)081-9339,+1-431-028-3337x3492,zkrueger@wolf-chavez.net,2021-12-06,http://www.khan.com/ +46,fD780ED8dbEae7B,Joanne,Montes,"Price, Sexton and Mcdaniel",Gwendolynview,Palau,(897)726-7952,(467)886-9467x5721,juan80@henson.net,2020-07-01,http://ochoa.com/ +47,300A40d3ce24bBA,Geoffrey,Guzman,Short-Wiggins,Zimmermanland,Uzbekistan,975.235.8921x269,(983)188-6873,bauercrystal@gay.com,2020-04-23,https://decker-kline.com/ +48,283DFCD0Dba40aF,Gloria,Mccall,"Brennan, Acosta and Ramos",North Kerriton,Ghana,445-603-6729,001-395-959-4736x4524,bartlettjenna@zuniga-moss.biz,2022-03-11,http://burgess-frank.com/ 
+49,F4Fc91fEAEad286,Brady,Cohen,Osborne-Erickson,North Eileenville,United Arab Emirates,741.849.0139x524,+1-028-691-7497x0894,mccalltyrone@durham-rose.biz,2022-03-10,http://hammond-barron.com/ +50,80F33Fd2AcebF05,Latoya,Mccann,"Hobbs, Garrett and Sanford",Port Sergiofort,Belarus,(530)287-4548x29481,162-234-0249x32790,bobhammond@barry.biz,2021-12-02,https://www.burton.com/ +51,Aa20BDe68eAb0e9,Gerald,Hawkins,"Phelps, Forbes and Koch",New Alberttown,Canada,+1-323-239-1456x96168,(092)508-0269,uwarner@steele-arias.com,2021-03-19,https://valenzuela.com/ +52,e898eEB1B9FE22b,Samuel,Crawford,"May, Goodwin and Martin",South Jasmine,Algeria,802-242-7457,626.116.9535x8578,xpittman@ritter-carney.net,2021-03-27,https://guerrero.org/ +53,faCEF517ae7D8eB,Patricia,Goodwin,"Christian, Winters and Ellis",Cowanfort,Swaziland,322.549.7139x70040,(111)741-4173,vaughanchristy@lara.biz,2021-03-08,http://clark.info/ +54,c09952De6Cda8aA,Stacie,Richard,Byrd Inc,New Deborah,Madagascar,001-622-948-3641x24810,001-731-168-2893x8891,clinton85@colon-arias.org,2020-10-15,https://kim.com/ +55,f3BEf3Be028166f,Robin,West,"Nixon, Blackwell and Sosa",Wallstown,Ecuador,698.303.4267,001-683-837-7651x525,greenemiranda@zimmerman.com,2022-01-13,https://www.mora.com/ +56,C6F2Fc6a7948a4e,Ralph,Haas,Montes PLC,Lake Ellenchester,Palestinian Territory,2239271999,001-962-434-0867x649,goodmancesar@figueroa.biz,2020-05-25,http://may.com/ +57,c8FE57cBBdCDcb2,Phyllis,Maldonado,Costa PLC,Lake Whitney,Saint Barthelemy,4500370767,001-508-064-6725x017,yhanson@warner-diaz.org,2021-01-25,http://www.bernard.com/ +58,B5acdFC982124F2,Danny,Parrish,Novak LLC,East Jaredbury,United Arab Emirates,(669)384-8597x8794,506.731.5952x571,howelldarren@house-cohen.com,2021-03-17,http://www.parsons-hudson.com/ +59,8c7DdF10798bCC3,Kathy,Hill,"Moore, Mccoy and Glass",Selenabury,South Georgia and the South Sandwich Islands,001-171-716-2175x310,888.625.0654,ncamacho@boone-simmons.org,2020-11-15,http://hayden.com/ 
+60,C681dDd0cc422f7,Kelli,Hardy,Petty Ltd,Huangfort,Sao Tome and Principe,020.324.2191x2022,424-157-8216,kristopher62@oliver.com,2020-12-20,http://www.kidd.com/ +61,a940cE42e035F28,Lynn,Pham,"Brennan, Camacho and Tapia",East Pennyshire,Portugal,846.468.6834x611,001-248-691-0006,mpham@rios-guzman.com,2020-08-21,https://www.murphy.com/ +62,9Cf5E6AFE0aeBfd,Shelley,Harris,"Prince, Malone and Pugh",Port Jasminborough,Togo,423.098.0315x8373,+1-386-458-8944x15194,zachary96@mitchell-bryant.org,2020-12-10,https://www.ryan.com/ +63,aEcbe5365BbC67D,Eddie,Jimenez,Caldwell Group,West Kristine,Ethiopia,+1-235-657-1073x6306,(026)401-7353x2417,kristiwhitney@bernard.com,2022-03-24,http://cherry.com/ +64,FCBdfCEAe20A8Dc,Chloe,Hutchinson,Simon LLC,South Julia,Netherlands,981-544-9452,+1-288-552-4666x060,leah85@sutton-terrell.com,2022-05-15,https://mitchell.info/ +65,636cBF0835E10ff,Eileen,Lynch,"Knight, Abbott and Hubbard",Helenborough,Liberia,+1-158-951-4131x53578,001-673-779-6713x680,levigiles@vincent.com,2021-01-02,http://mckay.com/ +66,fF1b6c9E8Fbf1ff,Fernando,Lambert,Church-Banks,Lake Nancy,Lithuania,497.829.9038,3863743398,fisherlinda@schaefer.net,2021-04-23,https://www.vang.com/ +67,2A13F74EAa7DA6c,Makayla,Cannon,Henderson Inc,Georgeport,New Caledonia,001-215-801-6392x46009,027-609-6460,scottcurtis@hurley.biz,2020-01-20,http://www.velazquez.net/ +68,a014Ec1b9FccC1E,Tom,Alvarado,Donaldson-Dougherty,South Sophiaberg,Kiribati,(585)606-2980x2258,730-797-3594x5614,nicholsonnina@montgomery.info,2020-08-18,http://odom-massey.com/ +69,421a109cABDf5fa,Virginia,Dudley,Warren Ltd,Hartbury,French Southern Territories,027.846.3705x14184,+1-439-171-1846x4636,zvalencia@phelps.com,2021-01-31,http://hunter-esparza.com/ +70,CC68FD1D3Bbbf22,Riley,Good,Wade PLC,Erikaville,Canada,6977745822,855-436-7641,alex06@galloway.com,2020-02-03,http://conway.org/ 
+71,CBCd2Ac8E3eBDF9,Alexandria,Buck,Keller-Coffey,Nicolasfort,Iran,078-900-4760x76668,414-112-8700x68751,lee48@manning.com,2021-02-20,https://ramsey.org/ +72,Ef859092FbEcC07,Richard,Roth,Conway-Mcbride,New Jasmineshire,Morocco,581-440-6539,9857827463,aharper@maddox-townsend.org,2020-02-23,https://www.brooks.com/ +73,F560f2d3cDFb618,Candice,Keller,Huynh and Sons,East Summerstad,Zimbabwe,001-927-965-8550x92406,001-243-038-4271x53076,buckleycory@odonnell.net,2020-08-22,https://www.lucero.com/ +74,A3F76Be153Df4a3,Anita,Benson,Parrish Ltd,Skinnerport,Russian Federation,874.617.5668x69878,(399)820-6418x0071,angie04@oconnell.com,2020-02-09,http://oconnor.com/ +75,D01Af0AF7cBbFeA,Regina,Stein,Guzman-Brown,Raystad,Solomon Islands,001-469-848-0724x4407,001-085-360-4426x00357,zrosario@rojas-hardin.net,2022-01-15,http://www.johnston.info/ +76,d40e89dCade7b2F,Debra,Riddle,"Chang, Aguirre and Leblanc",Colinhaven,United States Virgin Islands,+1-768-182-6014x14336,(303)961-4491,shieldskerry@robles.com,2020-07-11,http://kaiser.info/ +77,BF6a1f9bd1bf8DE,Brittany,Zuniga,Mason-Hester,West Reginald,Kyrgyz Republic,(050)136-9025,001-480-851-2496x0157,mchandler@cochran-huerta.org,2021-07-24,http://www.boyle.com/ +78,FfaeFFbbbf280db,Cassidy,Mcmahon,"Mcguire, Huynh and Hopkins",Lake Sherryborough,Myanmar,5040771311,684-682-0021x1326,katrinalane@fitzgerald.com,2020-10-21,https://hurst.com/ +79,CbAE1d1e9a8dCb1,Laurie,Pennington,"Sanchez, Marsh and Hale",Port Katherineville,Dominica,007.155.3406x553,+1-809-862-5566x277,cookejill@powell.com,2020-06-08,http://www.hebert.com/ +80,A7F85c1DE4dB87f,Alejandro,Blair,"Combs, Waller and Durham",Thomasland,Iceland,(690)068-4641x51468,555.509.8691x2329,elizabethbarr@ewing.com,2020-09-19,https://mercado-blevins.com/ +81,D6CEAfb3BDbaa1A,Leslie,Jennings,Blankenship-Arias,Coreybury,Micronesia,629.198.6346,075.256.0829,corey75@wiggins.com,2021-11-13,https://www.juarez.com/ +82,Ebdb6F6F7c90b69,Kathleen,Mckay,"Coffey, Lamb and Johnson",Lake Janiceton,Saint 
Vincent and the Grenadines,(733)910-9968,(691)247-4128x0665,chloelester@higgins-wilkinson.com,2021-09-12,http://www.owens-mooney.com/ +83,E8E7e8Cfe516ef0,Hunter,Moreno,Fitzpatrick-Lawrence,East Clinton,Isle of Man,(733)833-6754,001-761-013-7121,isaac26@benton-finley.com,2020-12-28,http://walls.info/ +84,78C06E9b6B3DF20,Chad,Davidson,Garcia-Jimenez,South Joshuashire,Oman,8275702958,(804)842-4715,justinwalters@jimenez.com,2021-11-15,http://www.garner-oliver.com/ +85,03A1E62ADdeb31c,Corey,Holt,"Mcdonald, Bird and Ramirez",New Glenda,Fiji,001-439-242-4986x7918,3162708934,maurice46@morgan.com,2020-02-18,http://www.watson.com/ +86,C6763c99d0bd16D,Emma,Cunningham,Stephens Inc,North Jillianview,New Zealand,128-059-0206x60217,(312)164-4545x2284,walter83@juarez.org,2022-05-13,http://www.reid.info/ +87,ebe77E5Bf9476CE,Duane,Woods,Montoya-Miller,Lyonsberg,Maldives,(636)544-7783x7288,(203)287-1003x5932,kmercer@wagner.com,2020-07-21,http://murray.org/ +88,E4Bbcd8AD81fC5f,Alison,Vargas,"Vaughn, Watts and Leach",East Cristinabury,Benin,365-273-8144,053-308-7653x6287,vcantu@norton.com,2020-11-10,http://mason.info/ +89,efeb73245CDf1fF,Vernon,Kane,Carter-Strickland,Thomasfurt,Yemen,114-854-1159x555,499-608-4612,hilljesse@barrett.info,2021-04-15,http://www.duffy-hensley.net/ +90,37Ec4B395641c1E,Lori,Flowers,Decker-Mcknight,North Joeburgh,Namibia,679.415.1210,945-842-3659x4581,tyrone77@valenzuela.info,2021-01-09,http://www.deleon-crosby.com/ +91,5ef6d3eefdD43bE,Nina,Chavez,Byrd-Campbell,Cassidychester,Bhutan,053-344-3205,+1-330-920-5422x571,elliserica@frank.com,2020-03-26,https://www.pugh.com/ +92,98b3aeDcC3B9FF3,Shane,Foley,Rocha-Hart,South Dannymouth,Hungary,+1-822-569-0302,001-626-114-5844x55073,nsteele@sparks.com,2021-07-06,https://www.holt-sparks.com/ +93,aAb6AFc7AfD0fF3,Collin,Ayers,Lamb-Peterson,South Lonnie,Anguilla,404-645-5351x012,001-257-582-8850x8516,dudleyemily@gonzales.biz,2021-06-29,http://www.ruiz.com/ +94,54B5B5Fe9F1B6C5,Sherry,Young,"Lee, Lucero and 
Johnson",Frankchester,Solomon Islands,158-687-1764,(438)375-6207x003,alan79@gates-mclaughlin.com,2021-04-04,https://travis.net/ +95,BE91A0bdcA49Bbc,Darrell,Douglas,"Newton, Petersen and Mathis",Daisyborough,Mali,001-084-845-9524x1777,001-769-564-6303,grayjean@lowery-good.com,2022-02-17,https://banks.biz/ +96,cb8E23e48d22Eae,Karl,Greer,Carey LLC,East Richard,Guyana,(188)169-1674x58692,001-841-293-3519x614,hhart@jensen.com,2022-01-30,http://hayes-perez.com/ +97,CeD220bdAaCfaDf,Lynn,Atkinson,"Ware, Burns and Oneal",New Bradview,Sri Lanka,+1-846-706-2218,605.413.3198,vkemp@ferrell.com,2021-07-10,https://novak-allison.com/ +98,28CDbC0dFe4b1Db,Fred,Guerra,Schmitt-Jones,Ortegaland,Solomon Islands,+1-753-067-8419x7170,+1-632-666-7507x92121,swagner@kane.org,2021-09-18,https://www.ross.com/ +99,c23d1D9EE8DEB0A,Yvonne,Farmer,Fitzgerald-Harrell,Lake Elijahview,Aruba,(530)311-9786,001-869-452-0943x12424,mccarthystephen@horn-green.biz,2021-08-11,http://watkins.info/ +100,2354a0E336A91A1,Clarence,Haynes,"Le, Nash and Cross",Judymouth,Honduras,(753)813-6941,783.639.1472,colleen91@faulkner.biz,2020-03-11,http://www.hatfield-saunders.net/ diff --git a/examples/example.py b/examples/example.py new file mode 100644 index 0000000..ba55d18 --- /dev/null +++ b/examples/example.py @@ -0,0 +1,77 @@ +import json +import pandas as pd +from src.pandas_query import PandasQuery + +# Load the sample customer data from CSV +df = pd.read_csv("customers-100.csv") + +# Create query executor +querier = PandasQuery(validate=True, temperature=0) + +# Execute query +try: + # query = "Get a table of all customers who have a first name beginning with 'D' and who live in a city with exactly two e's in it?" 
+ # query = "Get a subtable of people who live in Panama" + query = "Get a subtable of people whos surname backwards is: 'nosdodn' or 'atam'" + result = querier.execute(df, query) + + # Get complete results as a dictionary + result_dict = result.model_dump() + print("\nComplete results:") + print(json.dumps(result_dict, indent=2)) + + # df of results + print('\nHere is a table of the output results:\n') + df_result = pd.DataFrame(result.result) + print(df_result) + +except Exception as e: + print(f"Error executing query: {str(e)}") + +""" +Complete results: +{ + "query": "Get a subtable of people whos surname backwards is: 'nosdodn' or 'atam'", + "code": "result = df[df['Last Name'].fillna('').str[::-1].str.lower().isin(['nosdodn', 'atam'])]", + "is_valid": true, + "errors": [], + "result": [ + { + "Index": 13, + "Customer Id": "e35426EbDEceaFF", + "First Name": "Tracey", + "Last Name": "Mata", + "Company": "Graham-Francis", + "City": "South Joannamouth", + "Country": "Togo", + "Phone 1": "001-949-844-8787", + "Phone 2": "(855)713-8773", + "Email": "alex56@walls.org", + "Subscription Date": "2021-12-02", + "Website": "http://www.beck.com/" + }, + { + "Index": 18, + "Customer Id": "F8Aa9d6DfcBeeF8", + "First Name": "Greg", + "Last Name": "Mata", + "Company": "Valentine LLC", + "City": "Lake Leslie", + "Country": "Mozambique", + "Phone 1": "(701)087-2415", + "Phone 2": "(195)156-1861x26241", + "Email": "jaredjuarez@carroll.org", + "Subscription Date": "2022-03-26", + "Website": "http://pitts-cherry.com/" + } + ] +} + +Here is a table of the output results: + + Index Customer Id ... Subscription Date Website +12 13 e35426EbDEceaFF ... 2021-12-02 http://www.beck.com/ +17 18 F8Aa9d6DfcBeeF8 ... 
2022-03-26 http://pitts-cherry.com/ + +[2 rows x 12 columns] +""" \ No newline at end of file diff --git a/pandas_llm/__init__.py b/pandas_llm/__init__.py deleted file mode 100644 index ac54fb2..0000000 --- a/pandas_llm/__init__.py +++ /dev/null @@ -1,359 +0,0 @@ -import pandas as pd -import datetime -import numpy as np -import openai -import os -import re -import json - -# sandbox.py -from RestrictedPython import compile_restricted -from RestrictedPython.Guards import safe_builtins,guarded_iter_unpack_sequence -from RestrictedPython.Eval import default_guarded_getattr, default_guarded_getitem, default_guarded_getiter -import pandas as pd - -class Sandbox: - def __init__(self): - self._allowed_imports = {} - - def allow_import(self, module_name): - try: - module = __import__(module_name) - self._allowed_imports[module_name] = module - except ImportError: - pass - - def execute(self, code, local_vars = {}): - allowed_builtins = safe_builtins - # Add __builtins__, __import__, and allowed imports to the globals - restricted_globals = {"__builtins__": allowed_builtins} - restricted_globals.update(self._allowed_imports) - - builtin_mappings = { - "__import__": __import__, - "_getattr_": default_guarded_getattr, - "_getitem_": default_guarded_getitem, - "_getiter_": default_guarded_getiter, - "_iter_unpack_sequence_": guarded_iter_unpack_sequence, - "list": list, - "set": set, - "pd": pd, - } - - series_methods = [ - "sum", "mean", "any", "argmax", "argmin", "count", "cumsum", "cumprod", "diff", - "dropna", "fillna", "head", "idxmax", "idxmin", "last", "max", "min", "notna", - "prod", "quantile", "rename", "round", "tail", "to_frame", "to_list", "to_numpy", - "to_string","unique", "sort_index", "sort_values", "aggregate" - ] - - - builtin_mappings.update({method: getattr(pd.Series, method) for method in series_methods}) - - restricted_globals["__builtins__"].update(builtin_mappings) - - byte_code = compile_restricted(source=code, filename='', mode='exec') - - # Execute 
the restricted code - exec(byte_code, restricted_globals, local_vars) - - return local_vars - - -class PandasLLM(pd.DataFrame): - """ - PandasLLM is a subclass of the Pandas DataFrame class. It is designed to provide a - wrapper around the OpenAI API. - """ - - code_blocks = [r'```python(.*?)```',r'```(.*?)```'] - - llm_default_model = "gpt-3.5-turbo" - llm_default_temperature = 0.2 - llm_engine = "openai" - llm_default_params = { "model": llm_default_model, - "temperature": llm_default_temperature} - llm_api_key = None - - prompt_override = False - custom_prompt = "" - data_privacy = True - path = None - verbose = False - code_block = "" - force_sandbox = False - def __init__(self, - data, - llm_engine:str = "openai", llm_params=llm_default_params, - prompt_override:bool = False, - custom_prompt:str = "", - path:str = None, - verbose:bool = False, - data_privacy:bool = True, - llm_api_key:str = None, - force_sandbox:bool = False, - *args, **kwargs): - """ - This is the constructor for the PandasLLM class. It takes in the following arguments: - data: The data to be used. It can be a Pandas DataFrame, a list of lists, a list of tuples, - a list of dictionaries, a dictionary, a string, or a list. - llm_engine: The name of the OpenAI engine to use. - llm_params: A dictionary of parameters to be used with the OpenAI API. - prompt_override: A boolean that determines whether or not the prompt is overridden. - custom_prompt: A string that overrides the prompt. - path: The path to the file to be used. - verbose: A boolean that determines whether or not the output is verbose. - data_privacy: A boolean that determines whether or not the data is private. - llm_api_key: The OpenAI API key to be used. - force_sandbox: if False and the sandbox fails, it will retry using eval (less safe) - - The constructor also calls the parent class's constructor. - - - Args: - data (pandas dataframe, mandatory): dataset to query. Defaults to None. 
- llm_engine (str, optional): LLM engine, currently only OpenAI is supported. Defaults to "openai". - llm_params (dict, optional): LLM engine parameters. Defaults to model=gpt-3.5-turbo and temperature=0.2". - prompt_override (bool, optional): if True, the custom prompt is mandatory and it will became the main prompt. Defaults to False. - custom_prompt (str, optional): if prompt_override is False, the custom prompt will be added to the default pandas_llm prompt. Defaults to "". - path (str, optional): the path where the files containing debug data will be save. Defaults to None. - verbose (bool, optional): if True debugging info will be printed. Defaults to False. - data_privacy (bool, optional): if True, the function will not send the data content to OpenAI. Defaults to True. - llm_api_key (str, optional): the Open API key. Defaults to None. - force_sandbox (bool, optional): if False and the sandbox fails, it will retry using eval (less safe). Defaults to False. - """ - - - super().__init__(data, *args, **kwargs) - - self.llm_params = llm_params or {} - - # Set up OpenAI API key from the environment or the config - self.llm_api_key = llm_api_key or os.environ.get("OPENAI_API_KEY") - - self.llm_engine = llm_engine - self.llm_params = llm_params or {} - self.model = self.llm_params.get("model", self.llm_default_model) - self.temperature = self.llm_params.get("temperature", self.llm_default_temperature) - - self.prompt_override = prompt_override - self.custom_prompt = custom_prompt - - self.data_privacy = data_privacy - self.path = path - self.verbose = verbose - self.force_sandbox = force_sandbox - - def _buildPromptForRole(self): - prompt_role = f""" -I want you to act as a data scientist and Python coder. I want you code for me. -I have a dataset of {len(self)} rows and {len(self.columns)} columns. 
-Columns and their type are the following: - """ - - for col in self.columns: - col_type = self.dtypes[col] - prompt_role += f"{col} ({col_type})\n" - - return prompt_role - - def _buildPromptForProblemSolving(self, request): - - if self.prompt_override: - return self.custom_prompt - - columns = "" - for col in self.columns: - col_type = self.dtypes[col] - columns += f"{col} ({col_type})\n" - - prompt_problem = f""" -Given a DataFrame named 'df' of {len(self)} rows and {len(self.columns)} columns, -Its columns are the following: - -{columns} - -I want you to solve the following problem: -write a Python code snippet that addresses the following request: -{request} - -While crafting the code, please follow these guidelines: -1. When comparing or searching for strings, use lower case letters, ignore case sensitivity, and apply a "contains" search. -2. Ensure that the answer is a single line of code without explanations, comments, or additional details. -3. If a single line solution is not possible, multiline solutions or functions are acceptable, but the code must end with an assignment to the variable 'result'. -4. Assign the resulting code to the variable 'result'. -5. Avoid importing any additional libraries than pandas and numpy. 
- -""" - if not self.custom_prompt is None and len(self.custom_prompt) > 0: - - prompt_problem += f""" - Also: - {self.custom_prompt} - """ - - return prompt_problem - - def _extractPythonCode(self, text: str, regexp: str) -> str: - # Define the regular expression pattern for the Python code block - pattern = regexp - - # Search for the pattern in the input text - match = re.search(pattern, text, re.DOTALL) - - # If a match is found, return the extracted code (without the markers) - if match: - return match.group(1).strip() - - # If no match is found, return an empty string - return "" - - def _print(self, *args, **kwargs): - if self.verbose: - print(*args, **kwargs) - - # def _variable_to_string(self, variable): - # if variable is None: return None - # try: - - # if isinstance(variable, pd.Series): - # # convert to dataframe - # variable = variable.to_frame() - - # if isinstance(variable, pd.DataFrame): - # variable = variable.drop_duplicates() - # if len(variable) == 0: return None - # return str(variable) - - # elif isinstance(variable, np.ndarray): - # if len(variable) == 0: return None - # return np.array2string(variable) - # else: - # # Convert the variable to a string - # return str(variable) - # except Exception as e: - # return str(variable) - - - def _save(self,name,value): - if self.path is None or self.path == "": - return - try: - with open(f"{self.path}/{name}", 'w') as file: - file.write(value) - except Exception as e: - self._print(f"error {e}") - return - - def _execInSandbox(self, df, generated_code:str): - - # Create a Sandbox instance and allow pandas to be imported - sandbox = Sandbox() - sandbox.allow_import("pandas") - sandbox.allow_import("numpy") - - # Define the initial code to set up the DataFrame - initial_code = f""" -import pandas as pd -import datetime -from pandas import Timestamp -import numpy as np - - """ - - # Combine the initial code and the generated code - full_code = initial_code + "\n" + generated_code - - 
self._save("temp/prompt_code.py",full_code) - # Execute the combined code in the Sandbox - sandbox_result = sandbox.execute(full_code, {"df":df}) - - # Get the result from the local_vars dictionary - result = sandbox_result.get("result") - return result - - def prompt(self, request: str): - """ - - Args: - request (str): prompt containing the request. it must be expressed as a question or a problem to solve - - Returns: - Any: contains the result or solution of the problem. Tipically the result data type is a dataframe, a Series or a float - """ - - # Set up OpenAI API key - openai.api_key = self.llm_api_key - - messages=[ - {"role": "system", - "content": self._buildPromptForRole()}, - {"role": "user", - "content": self._buildPromptForProblemSolving(request) - } - ] - - response = None - for times in range(0,3): - try: - response = openai.ChatCompletion.create( - model=self.model, - temperature=self.temperature, - messages = messages - ) - break; - except Exception as e: - self._print(f"error {e}") - continue - - if response is None: - return "Please try later" - - self._save("temp/prompt_cmd.json",json.dumps(messages, indent=4)) - - generated_code = response.choices[0].message.content - if generated_code == "" or generated_code is None: - self.code_block = "" - return None - - self.code_block = generated_code - - results=[] - for regexp in self.code_blocks: - cleaned_code = self._extractPythonCode(generated_code,regexp) - if cleaned_code == "" or cleaned_code is None: - continue - results.append(cleaned_code) - results.append(generated_code) - - if len(results) == 0: - return None - - result = None - for cleaned_code in results: - - try: - result = self._execInSandbox(self, cleaned_code) - except Exception as e: - self._print(f"error {e}") - if not self.force_sandbox: - try: - expression = re.sub(r"^\s*result\s*=", "", cleaned_code).strip() - result = eval(expression, {'df': self, 'pd': pd, 'np': np, 'datetime': datetime, 'result': result}) - except Exception as 
e: - self._print(f"error {e}") - pass - - if result is not None and str(result) != "": - break - - if self.data_privacy == True: - # non formatted result - return result - - # currently the privacy option is not needed. - # in the future, we can choose to send data to LLM if privacy is set to false - - return result - - \ No newline at end of file diff --git a/pandas_llm/example-chatbot.py b/pandas_llm/example-chatbot.py deleted file mode 100644 index a5f9767..0000000 --- a/pandas_llm/example-chatbot.py +++ /dev/null @@ -1,80 +0,0 @@ -import os -import pandas as pd - -import sys -from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent)) - -from pandas_llm import PandasLLM - -# Data -# Please note that these names, ages, and donations are randomly generated and do not correspond to real individuals or their donations. -data = [('John Doe', 25, 50), - ('Jane Smith', 38, 70), - ('Alex Johnson', 45, 80), - ('Jessica Brown', 60, 40), - ('Michael Davis', 22, 90), - ('Emily Wilson', 30, 60), - ('Daniel Taylor', 35, 75), - ('Sophia Moore', 40, 85), - ('David Thomas', 50, 65), - ('Olivia Jackson', 29, 55), - ('Carlos GarcĂ­a', 22, 50), - ('Ana Rodriguez', 38, 70), - ('Luis Hernandez', 45, 80), - ('Sofia Martinez', 60, 40), - ('Miguel Lopez', 22, 90), - ('Isabella Gonzalez', 30, 60), - ('Diego Perez', 35, 75), - ('Maria Sanchez', 40, 85), - ('Juan Pena', 50, 65), - ('Gabriela Ramirez', 29, 55), - ('Giovanni Rossi', 22, 50), - ('Maria Bianchi', 38, 70), - ('Luca Ferrari', 45, 80), - ('Sofia Russo', 60, 40), - ('Francesco Romano', 22, 90), - ('Isabella Colombo', 30, 60), - ('Alessandro Ricci', 35, 75), - ('Giulia Marino', 40, 85), - ('Antonio Greco', 50, 65), - ('Gabriella Bruno', 29, 55)] - -# Create DataFrame -df = pd.DataFrame(data, columns=['name', 'age', 'donation']) - -# Print DataFrame -print(df) - - -def main(): - - # Initialise library and set the OpenAI API key - conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY")) 
- print() - banner = """ - Welcome to the Donation Data CLI. - The donation dataset has three columns (name, age, donation) - Please note that these names, ages, and donations are randomly generated and do not correspond to real individuals or their donations. - - You can ask questions like: - - show me the list of names - - What is the average age of people who donated? - - What is the average donation amount? - - What is the average donation of people older than 30? - - What is the average donation of people older than 30 who donated more than $50? - """ - print(banner) - - while True: - prompt = input("Enter your query (or 'exit' to quit): ") - if prompt.lower() == "exit": - break - - result = conv_df.prompt(prompt) - code = conv_df.code_block - print(f"Executing the following expression of type {type(result)}:\n{code}\n\nResult is:\n {result}\n") - - -if __name__ == "__main__": - main() diff --git a/pandas_llm/example.py b/pandas_llm/example.py deleted file mode 100644 index 4585f98..0000000 --- a/pandas_llm/example.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import pandas as pd - -import sys -from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent)) -from pandas_llm import PandasLLM - -# Data -# Please note that these names, ages, and donations are randomly generated -# and do not correspond to real individuals or their donations. 
-data = [('John Doe', 25, 50), - ('Jane Smith', 38, 70), - ('Alex Johnson', 45, 80), - ('Jessica Brown', 60, 40), - ('Michael Davis', 22, 90), - ('Emily Wilson', 30, 60), - ('Daniel Taylor', 35, 75), - ('Sophia Moore', 40, 85), - ('David Thomas', 50, 65), - ('Olivia Jackson', 29, 55)] -df = pd.DataFrame(data, columns=['name', 'age', 'donation']) - -conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY")) -result = conv_df.prompt("What is the average donation of people older than 40 who donated more than $50?") -code = conv_df.code_block - -print(f"Executing the following expression of type {type(result)}:\n{code}\n\nResult is:\n {result}\n") -# Executing the following expression of type : -# result = df.loc[(df['age'] > 40) & (df['donation'] > 50), 'donation'].mean() - -# Result is: -# 72.5 diff --git a/pandas_llm/pyvenv.cfg b/pandas_llm/pyvenv.cfg deleted file mode 100644 index f18227c..0000000 --- a/pandas_llm/pyvenv.cfg +++ /dev/null @@ -1,3 +0,0 @@ -home = /usr/local/opt/python@3.9/bin -include-system-site-packages = false -version = 3.9.16 diff --git a/pandas_llm/requirements.txt b/pandas_llm/requirements.txt deleted file mode 100644 index cdaed3e..0000000 --- a/pandas_llm/requirements.txt +++ /dev/null @@ -1,21 +0,0 @@ -aiohttp==3.8.4 -aiosignal==1.3.1 -async-timeout==4.0.2 -attrs==23.1.0 -certifi==2023.5.7 -charset-normalizer==3.1.0 -frozenlist==1.3.3 -idna==3.4 -multidict==6.0.4 -numpy==1.24.3 -openai==0.27.6 -pandas==2.0.1 -python-dateutil==2.8.2 -pytz==2023.3 -requests==2.30.0 -RestrictedPython==6.0 -six==1.16.0 -tqdm==4.65.0 -tzdata==2023.3 -urllib3==2.0.2 -yarl==1.9.2 diff --git a/requirements.txt b/requirements.txt index cdaed3e..e157541 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,4 @@ -aiohttp==3.8.4 -aiosignal==1.3.1 -async-timeout==4.0.2 -attrs==23.1.0 -certifi==2023.5.7 -charset-normalizer==3.1.0 -frozenlist==1.3.3 -idna==3.4 -multidict==6.0.4 -numpy==1.24.3 -openai==0.27.6 -pandas==2.0.1 
-python-dateutil==2.8.2 -pytz==2023.3 -requests==2.30.0 -RestrictedPython==6.0 -six==1.16.0 -tqdm==4.65.0 -tzdata==2023.3 -urllib3==2.0.2 -yarl==1.9.2 +RestrictedPython +pandas +openai +pydantic \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 224a779..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[metadata] -description-file = README.md \ No newline at end of file diff --git a/setup.py b/setup.py index 5515c65..440892c 100644 --- a/setup.py +++ b/setup.py @@ -1,55 +1,14 @@ from setuptools import setup, find_packages -# Reads the content of your README.md into a variable to be used in the setup below -with open("README.md", "r", encoding="utf-8") as fh: - long_description = fh.read() - setup( - name='pandas_llm', # should match the package folder - version='0.0.6', # important for updates - license='MIT', # should match your chosen license - description='Conversational Pandas Dataframes', - long_description=long_description, # loads your README.md - long_description_content_type="text/markdown", # README.md is of type 'markdown' - author='DashyDash', - author_email='alessio@dashydash.com', - url='https://github.com/DashyDashOrg/pandas-llm', - project_urls = { # Optional - "Bug Tracker": "https://github.com/DashyDashOrg/pandas-llm/issues" - }, - keywords=["pypi", "pandas-llm", "pandas", "llm", "ai", "openai", "chatgpt"], #descriptive meta-data + name="pandas-llm", + version="0.1.0", packages=find_packages(), - classifiers=[ # https://pypi.org/classifiers - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3', - "Operating System :: OS Independent", - ], - python_requires='>=3.6', install_requires=[ - "aiohttp", - "aiosignal", - "async-timeout", - "attrs", - "certifi", - "charset-normalizer", - "frozenlist", - "idna", - "multidict", + "pandas", 
"numpy", "openai", - "pandas", - "python-dateutil", - "pytz", - "requests", "RestrictedPython", - "six", - "tqdm", - "tzdata", - "urllib3", - "yarl", - ], - download_url="https://github.com/DashyDashOrg/pandas-llm/releases/tag/v0.0.6", + ], + python_requires=">=3.7", ) \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pandas_query.py b/src/pandas_query.py new file mode 100644 index 0000000..617619c --- /dev/null +++ b/src/pandas_query.py @@ -0,0 +1,291 @@ +# src/pandas_query.py +import pandas as pd +import numpy as np +from RestrictedPython import compile_restricted +from RestrictedPython.Guards import safe_builtins, guarded_iter_unpack_sequence +from RestrictedPython.Eval import default_guarded_getattr, default_guarded_getitem, default_guarded_getiter +from openai import OpenAI +import os +from typing import Dict, Any, Optional, List +from pydantic import BaseModel, Field, validator +from .pandas_validator import PandasQueryValidator + + +class QueryResult(BaseModel): + """Pydantic model for query execution results.""" + query: str = Field(..., description="Original query string") + code: str = Field(..., description="Generated pandas code") + is_valid: bool = Field(..., description="Whether the query is valid") + errors: List[str] = Field(default_factory=list, description="List of validation/execution errors") + result: Optional[Any] = Field(None, description="Query execution result") + + class Config: + arbitrary_types_allowed = True + json_encoders = { + pd.DataFrame: lambda df: df.to_dict(orient='records'), + pd.Series: lambda s: s.to_dict(), + np.ndarray: lambda arr: arr.tolist(), + np.int64: lambda x: int(x), + np.float64: lambda x: float(x) + } + + def _serialize_value(self, v: Any) -> Any: + """Helper method to serialize values.""" + if isinstance(v, pd.DataFrame): + return v.to_dict(orient='records') + elif isinstance(v, pd.Series): + return v.to_dict() + elif 
isinstance(v, np.ndarray): + return v.tolist() + elif isinstance(v, (np.int64, np.float64)): + return float(v) + elif isinstance(v, dict): + return {k: self._serialize_value(v) for k, v in v.items()} + elif isinstance(v, list): + return [self._serialize_value(item) for item in v] + return v + + def model_dump(self, **kwargs) -> Dict[str, Any]: + """Override model_dump to ensure all values are serializable.""" + data = { + 'query': self.query, + 'code': self.code, + 'is_valid': self.is_valid, + 'errors': self.errors, + 'result': self._serialize_value(self.result), + } + return data + + def get_results(self) -> Dict[str, Any]: + """Get a simplified dictionary of just the key results.""" + return { + 'valid': self.is_valid, + 'result': self._serialize_value(self.result) if self.is_valid else None, + 'errors': self.errors if not self.is_valid else [], + } + + @validator('result', pre=True) + def validate_result(cls, v): + """Convert pandas/numpy results to native Python types.""" + if isinstance(v, pd.DataFrame): + return v.to_dict(orient='records') + elif isinstance(v, pd.Series): + return v.to_dict() + elif isinstance(v, np.ndarray): + return v.tolist() + elif isinstance(v, (np.int64, np.float64)): + return float(v) + return v + + +class PandasQuery: + def __init__( + self, + model: str = "gpt-4", + temperature: float = 0.2, + api_key: Optional[str] = None, + validate: bool = True + ): + self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY")) + self.model = model + self.temperature = temperature + self.validate = validate + self.restricted_globals = self._setup_restricted_globals() + + def execute(self, df: pd.DataFrame, query: str) -> QueryResult: + """Execute a natural language query with validation and return comprehensive results.""" + import time + + # Initialize result with Pydantic model + query_result = QueryResult( + query=query, + code="", + is_valid=False, + errors=[], + result=None, + ) + + try: + # Get code from LLM + response = 
self.client.chat.completions.create( + model=self.model, + temperature=self.temperature, + messages=[ + {"role": "user", "content": self._build_prompt(df, query)} + ] + ) + + code = response.choices[0].message.content.strip() + code = self._clean_code(code) + query_result.code = code + + # Validate if required + if self.validate: + validator = PandasQueryValidator(df) + validation_result = validator.get_validation_result(code) + + if not validation_result['is_valid']: + query_result.errors = validation_result['errors'] + return query_result + + # Use suggested correction if available + if validation_result['suggested_correction']: + code = validation_result['suggested_correction'] + query_result.code = code + + # Execute if valid + result = self._execute_in_sandbox(code, df) + + query_result.is_valid = True + query_result.result = result + + except Exception as e: + query_result.errors.append(f"Execution error: {str(e)}") + + return query_result + + def _setup_restricted_globals(self) -> Dict: + """Set up restricted globals for sandbox execution.""" + # Core pandas Series methods + series_methods = [ + "sum", "mean", "any", "argmax", "argmin", "count", + "diff", "dropna", "fillna", "head", "max", "min", + "sort_values", "unique", "isna", "astype" + ] + + restricted_globals = { + "__builtins__": dict(safe_builtins), + "pd": pd, + "np": np, + "_getattr_": default_guarded_getattr, + "_getitem_": default_guarded_getitem, + "_getiter_": default_guarded_getiter, + "_iter_unpack_sequence_": guarded_iter_unpack_sequence, + } + + # Add series methods + restricted_globals.update({ + method: getattr(pd.Series, method) for method in series_methods + }) + + return restricted_globals + + def _build_prompt(self, df: pd.DataFrame, query: str) -> str: + """Build a detailed prompt with DataFrame information and query context.""" + # Convert DataFrame info to dictionary for better LLM interpretation + df_info = { + "metadata": { + "rows": len(df), + "columns": len(df.columns) + }, + 
"columns": {} + } + + for col in df.columns: + df_info["columns"][col] = { + "dtype": str(df[col].dtype), + "null_count": int(df[col].isna().sum()), + "unique_count": int(df[col].nunique()), + "sample_values": df[col].dropna().sample(min(3, len(df))).tolist() + } + if pd.api.types.is_numeric_dtype(df[col].dtype): + df_info["columns"][col].update({ + "min": float(df[col].min()) if not pd.isna(df[col].min()) else None, + "max": float(df[col].max()) if not pd.isna(df[col].max()) else None + }) + + prompt = f"""Given a pandas DataFrame with the following structure: + ``` + {df_info} + ``` + + Write a single line of Python code that answers this question: {query} + + Requirements: + 1. Assign result to 'result' variable + 2. Handle null values appropriately: + - For string operations: Use .fillna('') before .str operations + - For numeric operations: Use .fillna(0) or .dropna() as appropriate + - For boolean operations: Use .fillna(False) + + 3. String Operations Guidelines: + - Use .str accessor for string operations + - For case-insensitive matching: Use .str.lower() + - For counting: Use .str.count(pattern) + - For starts/ends with: Use .str.startswith() or .str.endswith() + - For contains: Use .str.contains(pattern, case=True/False) + - Always handle null values before string operations + + 4. Numeric Operations Guidelines: + - For string-to-numeric conversion: Use pd.to_numeric(df['column'], errors='coerce') + - For aggregations (sum, mean, etc.), only use with groupby + - For comparisons, use standard operators (>, <, >=, <=, ==, !=) + + 5. Date Operations Guidelines: + - Use .dt accessor for datetime operations + - Common attributes: .dt.year, .dt.month, .dt.day + - For date comparisons, use standard operators + + 6. Filtering Guidelines: + - Use boolean indexing: df[condition] + - For multiple conditions, use & (and) and | (or) with parentheses + - Example: df[(condition1) & (condition2)] + + 7. 
Return Guidelines: + - Return only the matching rows unless aggregation is specifically requested + - Do not include explanatory comments in the code + - Keep to a single line of code + + Available String Operations: + - Basic: contains, startswith, endswith, lower, upper, strip + - Count/Match: count, match, extract, find, findall + - Transform: replace, pad, center, slice, split + + Available Numeric Operations: + - Comparisons: >, <, >=, <=, ==, != + - Aggregations (with groupby only): sum, mean, median, min, max, count + + Example Patterns: + - String search: df[df['column'].fillna('').str.contains('pattern')] + - Multiple conditions: df[(df['col1'] > 0) & (df['col2'].str.startswith('prefix'))] + - Numeric filtering: df[pd.to_numeric(df['column'], errors='coerce') > value] + - Case-insensitive: df[df['column'].fillna('').str.lower().str.contains('pattern')] + + Return only the code, no explanations.""" + + return prompt + + + + @staticmethod + def _clean_code(code: str) -> str: + """Clean up code from LLM response.""" + if code.startswith("```"): + code = code.split("\n", 1)[1].rsplit("\n", 1)[0] + if code.startswith("python"): + code = code.split("\n", 1)[1] + return code.strip("` \n") + + def _execute_in_sandbox(self, code: str, df: pd.DataFrame) -> Any: + """Execute code in RestrictedPython sandbox.""" + byte_code = compile_restricted( + source=code, + filename='', + mode='exec' + ) + + local_vars = {'df': df, 'result': None, 'pd': pd} + exec(byte_code, self.restricted_globals, local_vars) + + if local_vars['result'] is None: + raise ValueError("Execution produced no result") + + return local_vars['result'] + + @staticmethod + def _extract_column_references(code: str) -> set[str]: + """Extract column references from code.""" + import re + pattern = r"df[\['](\w+)[\]']|df\.(\w+)" + matches = re.findall(pattern, code) + return {match[0] or match[1] for match in matches} \ No newline at end of file diff --git a/src/pandas_validator.py b/src/pandas_validator.py 
new file mode 100644 index 0000000..c06298e --- /dev/null +++ b/src/pandas_validator.py @@ -0,0 +1,200 @@ +# src/pandas_validator.py +import pandas as pd +from typing import Dict, List, Optional + + +class PandasQueryValidator: + """Validates pandas query operations and provides suggestions for corrections.""" + + def __init__(self, df: pd.DataFrame): + """Initialize validator with DataFrame schema information.""" + self.dtypes = df.dtypes.to_dict() + self.columns = set(df.columns) + + # Valid pandas operations by data type - simplified to most common operations + self.valid_operations = { + 'object': { + 'string_ops': { + 'contains', 'startswith', 'endswith', 'count', # Added count explicitly + 'lower', 'upper', 'strip', 'len', 'slice', 'extract', + 'find', 'findall', 'replace', 'pad', 'center', 'split' + }, + 'comparisons': {'==', '!=', 'isin'} + }, + 'number': { + 'numeric_ops': {'sum', 'mean', 'min', 'max'}, + 'comparisons': {'>', '<', '>=', '<=', '==', '!='} + }, + 'datetime': { + 'date_ops': {'year', 'month', 'day'}, + 'comparisons': {'>', '<', '>=', '<=', '==', '!='} + }, + 'bool': { + 'comparisons': {'==', '!='} + } + } + + # Common pandas aggregation functions that require groupby + # Removed 'count' from here since it's also a string operation + self.group_required_aggs = { + 'sum', 'mean', 'median', 'min', 'max' + } + + + + def _extract_operations(self, code: str) -> Dict[str, List[str]]: + """Extract pandas operations from the code, categorizing them by type.""" + import re + + # Match string operations specifically + str_pattern = r'\.str\.(\w+)' + str_ops = re.findall(str_pattern, code) + + # Match other operations, excluding string operations + other_pattern = r'(? 
list[str]: + """Check for valid aggregation function usage.""" + errors = [] + operations = self._extract_operations(code) + + # Check only non-string operations for aggregation requirements + for op in operations['other_ops']: + if op in self.group_required_aggs: # count is no longer here + if 'groupby' not in code: + error_msg = f"Aggregation '{op}' used without groupby" + errors.append(error_msg) + + return errors + + def _check_operation_compatibility(self, code: str) -> list[str]: + """Check if operations are compatible with column data types.""" + errors = [] + operations = self._extract_operations(code) + column_refs = self._extract_column_references(code) + + for col in column_refs: + if col not in self.columns: + continue + + dtype = self.dtypes[col] + dtype_category = ( + 'number' if pd.api.types.is_numeric_dtype(dtype) + else 'datetime' if pd.api.types.is_datetime64_dtype(dtype) + else 'bool' if pd.api.types.is_bool_dtype(dtype) + else 'object' + ) + + # Check string operations + if operations['string_ops']: + if dtype_category != 'object': + errors.append( + f"String operations used on non-string column '{col}' " + f"of type {dtype}" + ) + else: + for op in operations['string_ops']: + if op not in self.valid_operations['object']['string_ops']: + errors.append( + f"String operation '{op}' may not be valid for " + f"column '{col}'" + ) + + # Check other operations + if dtype_category in self.valid_operations: + valid_ops = set().union( + *self.valid_operations[dtype_category].values() + ) + for op in operations['other_ops']: + if op not in valid_ops and op not in self.group_required_aggs: + errors.append( + f"Operation '{op}' may not be compatible with " + f"column '{col}' of type {dtype}" + ) + + return errors + + def _extract_column_references(self, code: str) -> set[str]: + """Extract column references from the code.""" + import re + # Match patterns like df['column'] or df.column + pattern = r"df[\['](\w+)[\]']|df\.(\w+)" + matches = 
re.findall(pattern, code) + # Flatten and filter matches + return {match[0] or match[1] for match in matches} + + + def _check_column_existence(self, code: str) -> list[str]: + """Check if all referenced columns exist in the DataFrame.""" + errors = [] + referenced_columns = self._extract_column_references(code) + + for col in referenced_columns: + if col not in self.columns: + similar_cols = [ + existing_col for existing_col in self.columns + if existing_col.lower() == col.lower() + ] + error_msg = f"Column '{col}' does not exist in DataFrame" + if similar_cols: + error_msg += f". Did you mean '{similar_cols[0]}'?" + errors.append(error_msg) + + return errors + + + def suggest_corrections(self, code: str) -> Optional[str]: + """Attempt to suggest corrections for common issues.""" + corrected = code + + # Fix column name case sensitivity + for col in self._extract_column_references(code): + if col not in self.columns: + for actual_col in self.columns: + if col.lower() == actual_col.lower(): + corrected = corrected.replace( + f"['{col}']", f"['{actual_col}']" + ) + corrected = corrected.replace( + f".{col}", f".{actual_col}" + ) + + # Add null handling for string operations + if '.str.' 
in corrected and 'fillna' not in corrected: + corrected = corrected.replace('.str.', '.fillna("").str.') + + if corrected != code: + return corrected + return None + + def validate_query(self, code: str) -> tuple[bool, list[str]]: + """Validate a pandas query code.""" + errors = [] + + # Run all essential checks + errors.extend(self._check_column_existence(code)) + errors.extend(self._check_operation_compatibility(code)) + errors.extend(self._check_aggregation_usage(code)) + + return len(errors) == 0, errors + + def get_validation_result(self, code: str) -> Dict: + """Get comprehensive validation results.""" + is_valid, errors = self.validate_query(code) + suggested_correction = None + + if not is_valid: + suggested_correction = self.suggest_corrections(code) + + return { + 'code': code.strip(), + 'is_valid': is_valid, + 'errors': errors, + 'suggested_correction': suggested_correction + } \ No newline at end of file