From 7a34562d50cc8015bb38b7d98c11becdc685e0ac Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 13 Jun 2024 21:41:54 +0200 Subject: [PATCH] refactoring of merging answers nodes --- .../nodes/generate_answer_csv_node.py | 7 +- scrapegraphai/nodes/generate_answer_node.py | 10 +- .../nodes/generate_answer_omni_node.py | 7 +- .../nodes/generate_answer_pdf_node.py | 7 +- tests/inputs/books.xml | 120 ++++++++++++ tests/inputs/example.json | 182 ++++++++++++++++++ tests/inputs/plain_html_example.txt | 105 ++++++++++ tests/inputs/username.csv | 7 + 8 files changed, 427 insertions(+), 18 deletions(-) create mode 100644 tests/inputs/books.xml create mode 100644 tests/inputs/example.json create mode 100644 tests/inputs/plain_html_example.txt create mode 100644 tests/inputs/username.csv diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 7440d17d..941d3a2e 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -117,6 +117,9 @@ def execute(self, state): "format_instructions": format_instructions, }, ) + + chain = prompt | self.llm_model | output_parser + answer = chain.invoke({"question": user_prompt}) else: prompt = PromptTemplate( template=template_chunks_csv, @@ -145,10 +148,6 @@ def execute(self, state): ) merge_chain = merge_prompt | self.llm_model | output_parser answer = merge_chain.invoke({"context": answer, "question": user_prompt}) - else: - # Chain - single_chain = list(chains_dict.values())[0] - answer = single_chain.invoke({"question": user_prompt}) # Update the state with the generated answer state.update({self.output[0]: answer}) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index c6b8c388..10437f12 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -99,7 +99,9 @@ def execute(self, state: dict) -> dict: input_variables=["question"], partial_variables={"context": chunk.page_content, "format_instructions": format_instructions}) - + chain = prompt | self.llm_model | output_parser + answer = chain.invoke({"question": user_prompt}) + else: prompt = PromptTemplate( template=template_chunks, @@ -125,11 +127,7 @@ def execute(self, state: dict) -> dict: ) merge_chain = merge_prompt | self.llm_model | output_parser answer = merge_chain.invoke({"context": answer, "question": user_prompt}) - else: - # Chain - single_chain = list(chains_dict.values())[0] - answer = single_chain.invoke({"question": user_prompt}) # Update the state with the generated answer state.update({self.output[0]: answer}) - return state \ No newline at end of file + return state diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index 480459e3..879ac5b1 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -105,6 +105,9 @@ def execute(self, state: dict) -> dict: "img_desc": imag_desc, }, ) + + chain = prompt | self.llm_model | output_parser + answer = chain.invoke({"question": user_prompt}) else: prompt = PromptTemplate( template=template_chunks_omni, @@ -136,10 +139,6 @@ def execute(self, state: dict) -> dict: ) merge_chain = merge_prompt | self.llm_model | output_parser answer = merge_chain.invoke({"context": answer, "question": user_prompt}) - else: - # Chain - single_chain = list(chains_dict.values())[0] - answer = single_chain.invoke({"question": user_prompt}) # Update the state with the generated answer state.update({self.output[0]: answer}) diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 897e1c56..aefdc8c8 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -117,6 +117,9 @@ def execute(self, state): "format_instructions": format_instructions, }, ) + + chain = prompt | self.llm_model | output_parser + answer = chain.invoke({"question": user_prompt}) else: prompt = PromptTemplate( template=template_chunks_pdf, @@ -145,10 +148,6 @@ def execute(self, state): ) merge_chain = merge_prompt | self.llm_model | output_parser answer = merge_chain.invoke({"context": answer, "question": user_prompt}) - else: - # Chain - single_chain = list(chains_dict.values())[0] - answer = single_chain.invoke({"question": user_prompt}) # Update the state with the generated answer state.update({self.output[0]: answer}) diff --git a/tests/inputs/books.xml b/tests/inputs/books.xml new file mode 100644 index 00000000..e3d1fe87 --- /dev/null +++ b/tests/inputs/books.xml @@ -0,0 +1,120 @@ + + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + 2000-10-01 + An in-depth look at creating applications + with XML. + + + Ralls, Kim + Midnight Rain + Fantasy + 5.95 + 2000-12-16 + A former architect battles corporate zombies, + an evil sorceress, and her own childhood to become queen + of the world. + + + Corets, Eva + Maeve Ascendant + Fantasy + 5.95 + 2000-11-17 + After the collapse of a nanotechnology + society in England, the young survivors lay the + foundation for a new society. + + + Corets, Eva + Oberon's Legacy + Fantasy + 5.95 + 2001-03-10 + In post-apocalypse England, the mysterious + agent known only as Oberon helps to create a new life + for the inhabitants of London. Sequel to Maeve + Ascendant. + + + Corets, Eva + The Sundered Grail + Fantasy + 5.95 + 2001-09-10 + The two daughters of Maeve, half-sisters, + battle one another for control of England. Sequel to + Oberon's Legacy. + + + Randall, Cynthia + Lover Birds + Romance + 4.95 + 2000-09-02 + When Carla meets Paul at an ornithology + conference, tempers fly as feathers get ruffled. + + + Thurman, Paula + Splish Splash + Romance + 4.95 + 2000-11-02 + A deep sea diver finds true love twenty + thousand leagues beneath the sea. + + + Knorr, Stefan + Creepy Crawlies + Horror + 4.95 + 2000-12-06 + An anthology of horror stories about roaches, + centipedes, scorpions and other insects. + + + Kress, Peter + Paradox Lost + Science Fiction + 6.95 + 2000-11-02 + After an inadvertant trip through a Heisenberg + Uncertainty Device, James Salway discovers the problems + of being quantum. + + + O'Brien, Tim + Microsoft .NET: The Programming Bible + Computer + 36.95 + 2000-12-09 + Microsoft's .NET initiative is explored in + detail in this deep programmer's reference. + + + O'Brien, Tim + MSXML3: A Comprehensive Guide + Computer + 36.95 + 2000-12-01 + The Microsoft MSXML3 parser is covered in + detail, with attention to XML DOM interfaces, XSLT processing, + SAX and more. + + + Galos, Mike + Visual Studio 7: A Comprehensive Guide + Computer + 49.95 + 2001-04-16 + Microsoft Visual Studio 7 is explored in depth, + looking at how Visual Basic, Visual C++, C#, and ASP+ are + integrated into a comprehensive development + environment. + + \ No newline at end of file diff --git a/tests/inputs/example.json b/tests/inputs/example.json new file mode 100644 index 00000000..2263184c --- /dev/null +++ b/tests/inputs/example.json @@ -0,0 +1,182 @@ +{ + "kind":"youtube#searchListResponse", + "etag":"q4ibjmYp1KA3RqMF4jFLl6PBwOg", + "nextPageToken":"CAUQAA", + "regionCode":"NL", + "pageInfo":{ + "totalResults":1000000, + "resultsPerPage":5 + }, + "items":[ + { + "kind":"youtube#searchResult", + "etag":"QCsHBifbaernVCbLv8Cu6rAeaDQ", + "id":{ + "kind":"youtube#video", + "videoId":"TvWDY4Mm5GM" + }, + "snippet":{ + "publishedAt":"2023-07-24T14:15:01Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Football Clubs Kylian Mbappe Should Avoid Signing ✍️❌⚽️ #football #mbappe #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/TvWDY4Mm5GM/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T14:15:01Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"0NG5QHdtIQM_V-DBJDEf-jK_Y9k", + "id":{ + "kind":"youtube#video", + "videoId":"aZM_42CcNZ4" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:09:27Z", + "channelId":"UCM5gMM_HqfKHYIEJ3lstMUA", + "title":"Which Football Club Could Cristiano Ronaldo Afford To Buy? 💰", + "description":"Sign up to Sorare and get a FREE card: https://sorare.pxf.io/NellisShorts Give Soraredata a go for FREE: ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/aZM_42CcNZ4/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"John Nellis", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:09:27Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"WbBz4oh9I5VaYj91LjeJvffrBVY", + "id":{ + "kind":"youtube#video", + "videoId":"wkP3XS3aNAY" + }, + "snippet":{ + "publishedAt":"2023-07-24T16:00:50Z", + "channelId":"UC4EP1dxFDPup_aFLt0ElsDw", + "title":"PAULO DYBALA vs THE WORLD'S LONGEST FREEKICK WALL", + "description":"Can Paulo Dybala curl a football around the World's longest free kick wall? We met up with the World Cup winner and put him to ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/wkP3XS3aNAY/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Shoot for Love", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T16:00:50Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"juxv_FhT_l4qrR05S1QTrb4CGh8", + "id":{ + "kind":"youtube#video", + "videoId":"rJkDZ0WvfT8" + }, + "snippet":{ + "publishedAt":"2023-07-24T10:00:39Z", + "channelId":"UCO8qj5u80Ga7N_tP3BZWWhQ", + "title":"TOP 10 DEFENDERS 2023", + "description":"SoccerKingz https://soccerkingz.nl Use code: 'ILOVEHOF' to get 10% off. TOP 10 DEFENDERS 2023 Follow us! • Instagram ...", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/rJkDZ0WvfT8/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"Home of Football", + "liveBroadcastContent":"none", + "publishTime":"2023-07-24T10:00:39Z" + } + }, + { + "kind":"youtube#searchResult", + "etag":"wtuknXTmI1txoULeH3aWaOuXOow", + "id":{ + "kind":"youtube#video", + "videoId":"XH0rtu4U6SE" + }, + "snippet":{ + "publishedAt":"2023-07-21T16:30:05Z", + "channelId":"UCwozCpFp9g9x0wAzuFh0hwQ", + "title":"3 Things You Didn't Know About Erling Haaland ⚽️🇳🇴 #football #haaland #shorts", + "description":"", + "thumbnails":{ + "default":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/default.jpg", + "width":120, + "height":90 + }, + "medium":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/mqdefault.jpg", + "width":320, + "height":180 + }, + "high":{ + "url":"https://i.ytimg.com/vi/XH0rtu4U6SE/hqdefault.jpg", + "width":480, + "height":360 + } + }, + "channelTitle":"FC Motivate", + "liveBroadcastContent":"none", + "publishTime":"2023-07-21T16:30:05Z" + } + } + ] +} \ No newline at end of file diff --git a/tests/inputs/plain_html_example.txt b/tests/inputs/plain_html_example.txt new file mode 100644 index 00000000..78f814ae --- /dev/null +++ b/tests/inputs/plain_html_example.txt @@ -0,0 +1,105 @@ + +
+ + +
+
+
+
+
+
+

Projects

+

+
+
+ +
+
+
+ +
+ \ No newline at end of file diff --git a/tests/inputs/username.csv b/tests/inputs/username.csv new file mode 100644 index 00000000..006ac8e6 --- /dev/null +++ b/tests/inputs/username.csv @@ -0,0 +1,7 @@ +Username; Identifier;First name;Last name +booker12;9012;Rachel;Booker +grey07;2070;Laura;Grey +johnson81;4081;Craig;Johnson +jenkins46;9346;Mary;Jenkins +smith79;5079;Jamie;Smith +