# Configuration file for the RAG agent with Google Drive integration.
#
# This flow ingests documents from both the local filesystem and Google Drive,
# then retrieves similar content and documents.

---
# Logging settings: level for stdout, level for the log file, and the log
# file name.
log:
  stdout_log_level: INFO
  log_file_level: INFO
  log_file: solace_ai_connector.log

shared_config:
  # Broker connection settings, anchored as `broker_connection` so that
  # components can reuse them through `<<: *broker_connection` or
  # `*broker_connection`. Every value is a `${VAR}` placeholder (presumably
  # resolved from environment variables at load time; confirm with the loader).
  - broker_config: &broker_connection
      dev_mode: ${SOLACE_DEV_MODE}
      broker_url: ${SOLACE_BROKER_URL}
      broker_username: ${SOLACE_BROKER_USERNAME}
      broker_password: ${SOLACE_BROKER_PASSWORD}
      broker_vpn: ${SOLACE_BROKER_VPN}
      temporary_queue: ${USE_TEMPORARY_QUEUES}

flows:
  # NOTE(review): this section was reconstructed from a flattened copy that
  # had lost all indentation. Confirm the nesting of `schedule`, `database`,
  # `include_google_formats`, `normalize_embeddings`,
  # `broker_request_response` and `component_input` against the component
  # schema. A stray leading space inside quoted strings was removed as a copy
  # artifact; confirm the `separator` values (" " and "\n") in particular.

  # Flow to handle action requests
  - name: rag_action_request_processor
    components:
      # Input from a Solace broker
      - component_name: broker_input
        component_module: broker_input
        component_config:
          <<: *broker_connection
          payload_encoding: utf-8
          payload_format: json
          broker_queue_name: ${SOLACE_AGENT_MESH_NAMESPACE}agent_rag_action_request
          broker_subscriptions:
            # Subscribe to all rag actions
            - topic: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1/actionRequest/*/*/rag/>
              qos: 1

      # Custom component to process the action request
      - component_name: action_request_processor
        component_base_path: .
        # path is completed at build time
        component_module: "{{MODULE_DIRECTORY}}.agents.rag.rag_agent_component"
        component_config:
          llm_service_topic: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1/llm-service/request/general-good/
          embedding_service_topic: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1/embedding-service/request/text/
          agent_name: rag

          # Enhanced scanner configuration with multiple sources
          scanner:
            batch: true
            use_memory_storage: true

            # Multiple sources configuration (NEW FORMAT)
            sources:
              # File system source
              - type: filesystem
                directories:
                  - "${LOCAL_DOCUMENTS_PATH}"  # e.g. "/path/to/local/documents"
                filters:
                  file_formats:
                    - ".txt"
                    - ".pdf"
                    - ".docx"
                    - ".doc"
                    - ".md"
                    - ".html"
                    - ".csv"
                    - ".json"
                    - ".odt"
                    - ".xlsx"
                    - ".xls"
                  max_file_size: 10240  # in KB (10MB)
                schedule:
                  interval: 60  # seconds

              # Google Drive source with Service Account Authentication
              - type: google_drive
                provider: google_drive
                auth_type: "service_account"  # Use Service Account instead of OAuth2
                service_account_key_path: "${GOOGLE_SERVICE_ACCOUNT_KEY_PATH}"  # e.g. "/path/to/service-account-key.json"
                folders:
                  - folder_id: "${GOOGLE_DRIVE_FOLDER_ID_1}"  # e.g. "1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms"
                    name: "Documents"
                    recursive: true
                    type: "personal"
                  - folder_id: "${GOOGLE_DRIVE_FOLDER_ID_2}"  # e.g. "0AEd3EhGff_FmUk9PVA"
                    name: "Shared Drive"
                    recursive: true
                    type: "shared_drive"
                filters:
                  file_formats:
                    - ".txt"
                    - ".pdf"
                    - ".docx"
                    - ".doc"
                    - ".md"
                    - ".html"
                    - ".csv"
                    - ".json"
                    - ".odt"
                    - ".xlsx"
                    - ".xls"
                  max_file_size: 10240  # in KB (10MB)
                  include_google_formats: true  # Include Google Docs, Sheets, Slides
                real_time:
                  enabled: true
                  webhook_url: "${GOOGLE_DRIVE_WEBHOOK_URL}"  # e.g. "https://your-domain.com/webhook/google-drive"
                  polling_interval: 300  # Fallback polling in seconds (5 minutes)

            # Legacy database configuration (optional for persistent metadata storage)
            # TODO: Deprecated and will be removed in the next version, enable use_memory_storage instead
            database:
              type: postgresql
              dbname: ${DB_NAME}
              host: ${DB_HOST}
              port: ${DB_PORT}
              user: ${DB_USER}
              password: ${DB_PASSWORD}

          # Preprocessor configuration
          preprocessor:
            default_preprocessor:
              type: enhanced
              params:
                lowercase: true
                normalize_whitespace: true
                remove_stopwords: false
                remove_punctuation: false
                remove_numbers: false
                remove_non_ascii: false
                remove_urls: true
                remove_emails: false
                remove_html_tags: false

            preprocessors:
              # Text file configurations
              text:
                type: text
                params:
                  lowercase: true
                  normalize_whitespace: true
                  remove_stopwords: false
                  remove_punctuation: true
                  remove_numbers: false
                  remove_non_ascii: false
                  remove_urls: true
                  remove_emails: false
                  remove_html_tags: false

              # Document file configurations
              pdf:
                type: document
                params:
                  lowercase: true
                  normalize_whitespace: true
                  remove_stopwords: false
                  remove_punctuation: true
                  remove_numbers: false
                  remove_non_ascii: true
                  remove_urls: true
                  remove_emails: true
                  remove_html_tags: false

              doc:
                type: document
                params:
                  lowercase: true
                  normalize_whitespace: true
                  remove_stopwords: false
                  remove_punctuation: true
                  remove_numbers: false
                  remove_non_ascii: true
                  remove_urls: true
                  remove_emails: true
                  remove_html_tags: false

              odt:
                type: document
                params:
                  lowercase: true
                  normalize_whitespace: true
                  remove_stopwords: false
                  remove_punctuation: true
                  remove_numbers: false
                  remove_non_ascii: true
                  remove_urls: true
                  remove_emails: true
                  remove_html_tags: false

              # Structured data configurations
              json:
                type: structured
                params:
                  lowercase: true
                  normalize_whitespace: true
                  remove_stopwords: false
                  remove_punctuation: false
                  remove_numbers: false
                  remove_non_ascii: false
                  remove_urls: true
                  remove_emails: true
                  remove_html_tags: false

              html:
                type: html
                params:
                  lowercase: true
                  normalize_whitespace: true
                  remove_stopwords: false
                  remove_punctuation: false
                  remove_numbers: false
                  remove_non_ascii: false
                  remove_urls: true
                  remove_emails: true
                  remove_html_tags: false

              markdown:
                type: markdown
                params:
                  lowercase: true
                  normalize_whitespace: true
                  remove_stopwords: false
                  remove_punctuation: false
                  remove_numbers: false
                  remove_non_ascii: false
                  remove_urls: true
                  remove_emails: true
                  remove_html_tags: false

              csv:
                type: csv
                params:
                  lowercase: true
                  normalize_whitespace: true
                  remove_stopwords: false
                  remove_punctuation: true
                  remove_numbers: false
                  remove_non_ascii: false
                  remove_urls: true
                  remove_emails: true
                  remove_html_tags: false

              xls:
                type: xls
                params:
                  lowercase: true
                  normalize_whitespace: true
                  remove_stopwords: false
                  remove_punctuation: true
                  remove_numbers: false
                  remove_non_ascii: false
                  remove_urls: true
                  remove_emails: true
                  remove_html_tags: false

          # Text splitter configuration
          splitter:
            default_splitter:
              type: character
              params:
                chunk_size: 2048  # minimum chunk size
                chunk_overlap: 800
                separator: " "
            splitters:
              # Text file configurations
              text:
                type: character
                params:
                  chunk_size: 2048  # minimum chunk size
                  chunk_overlap: 800
                  separator: " "
                  is_separator_regex: false
                  keep_separator: true
                  strip_whitespace: true
              txt:
                type: character
                params:
                  chunk_size: 2048  # minimum chunk size
                  chunk_overlap: 800
                  separator: "\n"
                  is_separator_regex: false
                  keep_separator: true
                  strip_whitespace: true
              # Structured data configurations
              json:
                type: recursive_json
                params:
                  chunk_size: 200
                  chunk_overlap: 50
              html:
                type: html
                params:
                  chunk_size: 2048
                  chunk_overlap: 800
                  tags_to_extract: ["p", "h1", "h2", "h3", "li"]
              markdown:
                type: markdown
                params:
                  chunk_size: 2048
                  chunk_overlap: 800
                  headers_to_split_on: ["#", "##", "###", "####", "#####", "######"]
                  strip_headers: false
              csv:
                type: csv
                params:
                  chunk_size: 2048  # chunk size in number of rows
                  include_header: false
              # Add Xml, Odt, Xlsx, and other formats as needed

          # Embedding configuration
          embedding:  # LiteLLM embedding
            embedder_type: "openai"
            embedder_params:
              # OpenAI embeddings
              model: ${OPENAI_EMBEDDING_MODEL}
              api_key: ${OPENAI_API_KEY}
              api_base: ${OPENAI_API_ENDPOINT}
              batch_size: 32
              additional_kwargs: {}
            normalize_embeddings: true

          # Vector database configuration
          vector_db:
            # Qdrant
            db_type: "qdrant"
            db_params:
              url: ${QDRANT_URL}
              api_key: ${QDRANT_API_KEY}
              collection_name: ${QDRANT_COLLECTION}
              embedding_dimension: ${QDRANT_EMBEDDING_DIMENSION}

          llm:
            load_balancer:
              - model_name: "gpt-4o"  # model alias
                litellm_params:
                  model: openai/${OPENAI_MODEL_NAME}
                  api_key: ${OPENAI_API_KEY}
                  api_base: ${OPENAI_API_ENDPOINT}
                  temperature: 0.01
                  # add any other parameters here
              - model_name: "claude-3-5-sonnet"  # model alias
                litellm_params:
                  model: anthropic/${ANTHROPIC_MODEL_NAME}
                  api_key: ${ANTHROPIC_API_KEY}
                  api_base: ${ANTHROPIC_API_ENDPOINT}
                  # add any other parameters here
              # add more models here

          retrieval:
            top_k: 7

        broker_request_response:
          enabled: true
          broker_config: *broker_connection
          request_expiry_ms: 120000
          payload_encoding: utf-8
          payload_format: json
          response_topic_prefix: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1
          response_queue_prefix: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1
        component_input:
          source_expression: input.payload

      # Output to a Solace broker
      - component_name: broker_output
        component_module: broker_output
        component_config:
          <<: *broker_connection
          payload_encoding: utf-8
          payload_format: json
          copy_user_properties: true