Skip to content

Commit 74bb986

Browse files
authored
Merge pull request #30 from SolaceLabs/ap/DATAGO-103325/add_GCP_scanner
Ap/datago 103325/add gcp scanner
2 parents 2c549d7 + 6260326 commit 74bb986

File tree

15 files changed

+3707
-260
lines changed

15 files changed

+3707
-260
lines changed

sam-rag/configs/agents/rag_multi_cloud.yaml

Lines changed: 422 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 370 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,370 @@
1+
# Configuration file for the RAG agent with Google Drive integration.
2+
#
3+
# This flow ingests documents from both the local filesystem and Google Drive, then retrieves similar content and the documents it came from.
4+
5+
---
6+
# Logging settings: one level for stdout, one for the log file, plus the
# log file's path.
log:
  stdout_log_level: INFO
  log_file_level: INFO
  log_file: solace_ai_connector.log
10+
11+
# Broker connection settings, defined once and anchored as
# `broker_connection` so the components below can reuse them via
# `<<: *broker_connection` / `*broker_connection`.
# Every value is a `${...}` placeholder, so no credentials live in this file.
shared_config:
  - broker_config: &broker_connection
      dev_mode: ${SOLACE_DEV_MODE}
      broker_url: ${SOLACE_BROKER_URL}
      broker_username: ${SOLACE_BROKER_USERNAME}
      broker_password: ${SOLACE_BROKER_PASSWORD}
      broker_vpn: ${SOLACE_BROKER_VPN}
      temporary_queue: ${USE_TEMPORARY_QUEUES}
19+
20+
flows:
21+
# Flow to handle action requests
22+
- name: rag_action_request_processor
23+
components:
24+
# Input from a Solace broker
- component_name: broker_input
  component_module: broker_input
  component_config:
    # Pull in the shared broker connection settings (anchor defined under
    # shared_config); keys listed below override any merged-in duplicates.
    <<: *broker_connection
    payload_encoding: utf-8
    payload_format: json
    broker_queue_name: ${SOLACE_AGENT_MESH_NAMESPACE}agent_rag_action_request
    broker_subscriptions:
      # Subscribe to all rag actions
      - topic: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1/actionRequest/*/*/rag/>
        qos: 1
36+
37+
# Custom component to process the action request
- component_name: action_request_processor
  component_base_path: .
  # path is completed at build time
  component_module: "{{MODULE_DIRECTORY}}.agents.rag.rag_agent_component"
  # component_config continues past these three keys with the scanner,
  # preprocessor, splitter, embedding, vector_db, llm and retrieval sections.
  component_config:
    llm_service_topic: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1/llm-service/request/general-good/
    embedding_service_topic: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1/embedding-service/request/text/
    agent_name: rag
46+
47+
# Enhanced Scanner configuration with multiple sources
# NOTE(review): indentation was lost in this excerpt, so the nesting of the
# keys below cannot be read off these lines (e.g. whether max_file_size sits
# under filters, or schedule under the filesystem source entry) — confirm
# against the original file before relying on the structure.
48+
scanner:
49+
batch: true
50+
use_memory_storage: true
51+
52+
# Multiple sources configuration (NEW FORMAT)
# Each list entry under `sources` starts with a `- type: ...` line.
53+
sources:
54+
# File system source
55+
- type: filesystem
56+
directories:
57+
- "${LOCAL_DOCUMENTS_PATH}" # e.g. "/path/to/local/documents"
58+
filters:
59+
# File extensions listed for ingestion; the same eleven extensions are
# repeated for the Google Drive source.
file_formats:
60+
- ".txt"
61+
- ".pdf"
62+
- ".docx"
63+
- ".doc"
64+
- ".md"
65+
- ".html"
66+
- ".csv"
67+
- ".json"
68+
- ".odt"
69+
- ".xlsx"
70+
- ".xls"
71+
max_file_size: 10240 # in KB (10MB)
72+
schedule:
73+
interval: 60 # seconds
74+
75+
# Google Drive source with Service Account Authentication
# NOTE(review): indentation was lost in this excerpt; which mapping owns
# max_file_size, include_google_formats and polling_interval cannot be
# determined from these lines — confirm against the original file.
76+
- type: google_drive
77+
# NOTE(review): `provider` repeats the value of `type` above — confirm both
# keys are read by the scanner.
provider: google_drive
78+
auth_type: "service_account" # Use Service Account instead of OAuth2
79+
service_account_key_path: "${GOOGLE_SERVICE_ACCOUNT_KEY_PATH}" # e.g. "/path/to/service-account-key.json"
80+
# Two folders are configured: one marked "personal", one "shared_drive".
folders:
81+
- folder_id: "${GOOGLE_DRIVE_FOLDER_ID_1}" # e.g. "1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms"
82+
name: "Documents"
83+
recursive: true
84+
type: "personal"
85+
- folder_id: "${GOOGLE_DRIVE_FOLDER_ID_2}" # e.g. "0AEd3EhGff_FmUk9PVA"
86+
name: "Shared Drive"
87+
recursive: true
88+
type: "shared_drive"
89+
filters:
90+
# Same extension list as the filesystem source.
file_formats:
91+
- ".txt"
92+
- ".pdf"
93+
- ".docx"
94+
- ".doc"
95+
- ".md"
96+
- ".html"
97+
- ".csv"
98+
- ".json"
99+
- ".odt"
100+
- ".xlsx"
101+
- ".xls"
102+
max_file_size: 10240 # in KB (10MB)
103+
include_google_formats: true # Include Google Docs, Sheets, Slides
104+
real_time:
105+
enabled: true
106+
webhook_url: "${GOOGLE_DRIVE_WEBHOOK_URL}" # e.g. "https://your-domain.com/webhook/google-drive"
107+
polling_interval: 300 # Fallback polling in seconds (5 minutes)
108+
109+
# Legacy database configuration (optional, for persistent metadata storage).
# TODO: Deprecated and will be removed in the next version; enable
# use_memory_storage instead.
# NOTE(review): use_memory_storage is already set to true earlier in this
# scanner section — confirm whether this block is still read at all.
database:
  type: postgresql
  dbname: ${DB_NAME}
  host: ${DB_HOST}
  port: ${DB_PORT}
  user: ${DB_USER}
  password: ${DB_PASSWORD}
118+
119+
# Preprocessor configuration
# `default_preprocessor` holds one set of cleaning flags; `preprocessors`
# holds a per-format set keyed by format name.
# NOTE(review): the scanner's file_formats list includes .txt, .md, .docx and
# .xlsx, but the keys below are text/markdown/doc/xls with no docx or xlsx
# entry — confirm how file extensions are mapped onto these keys and that the
# unmatched ones fall back to default_preprocessor.
preprocessor:
  default_preprocessor:
    type: enhanced
    params:
      lowercase: true
      normalize_whitespace: true
      remove_stopwords: false
      remove_punctuation: false
      remove_numbers: false
      remove_non_ascii: false
      remove_urls: true
      remove_emails: false
      remove_html_tags: false

  preprocessors:
    # Text file configurations
    text:
      type: text
      params:
        lowercase: true
        normalize_whitespace: true
        remove_stopwords: false
        remove_punctuation: true
        remove_numbers: false
        remove_non_ascii: false
        remove_urls: true
        remove_emails: false
        remove_html_tags: false

    # Document file configurations
    # pdf, doc and odt share identical params: unlike the text entry they also
    # strip non-ASCII characters and e-mail addresses.
    pdf:
      type: document
      params:
        lowercase: true
        normalize_whitespace: true
        remove_stopwords: false
        remove_punctuation: true
        remove_numbers: false
        remove_non_ascii: true
        remove_urls: true
        remove_emails: true
        remove_html_tags: false

    doc:
      type: document
      params:
        lowercase: true
        normalize_whitespace: true
        remove_stopwords: false
        remove_punctuation: true
        remove_numbers: false
        remove_non_ascii: true
        remove_urls: true
        remove_emails: true
        remove_html_tags: false

    odt:
      type: document
      params:
        lowercase: true
        normalize_whitespace: true
        remove_stopwords: false
        remove_punctuation: true
        remove_numbers: false
        remove_non_ascii: true
        remove_urls: true
        remove_emails: true
        remove_html_tags: false

    # Structured data configurations
    # json, html and markdown keep punctuation; csv and xls remove it.
    json:
      type: structured
      params:
        lowercase: true
        normalize_whitespace: true
        remove_stopwords: false
        remove_punctuation: false
        remove_numbers: false
        remove_non_ascii: false
        remove_urls: true
        remove_emails: true
        remove_html_tags: false

    html:
      type: html
      params:
        lowercase: true
        normalize_whitespace: true
        remove_stopwords: false
        remove_punctuation: false
        remove_numbers: false
        remove_non_ascii: false
        remove_urls: true
        remove_emails: true
        remove_html_tags: false

    markdown:
      type: markdown
      params:
        lowercase: true
        normalize_whitespace: true
        remove_stopwords: false
        remove_punctuation: false
        remove_numbers: false
        remove_non_ascii: false
        remove_urls: true
        remove_emails: true
        remove_html_tags: false

    csv:
      type: csv
      params:
        lowercase: true
        normalize_whitespace: true
        remove_stopwords: false
        remove_punctuation: true
        remove_numbers: false
        remove_non_ascii: false
        remove_urls: true
        remove_emails: true
        remove_html_tags: false

    xls:
      type: xls
      params:
        lowercase: true
        normalize_whitespace: true
        remove_stopwords: false
        remove_punctuation: true
        remove_numbers: false
        remove_non_ascii: false
        remove_urls: true
        remove_emails: true
        remove_html_tags: false
254+
255+
256+
# Text splitter configuration
# `default_splitter` holds one set of chunking parameters; `splitters` holds a
# per-format set keyed by format name.
# NOTE(review): the original inline comments called chunk_size the "minimum
# chunk size". Character-based splitters commonly treat chunk_size as an upper
# bound on chunk length — confirm against the splitter implementation.
splitter:
  default_splitter:
    type: character
    params:
      chunk_size: 2048 # see NOTE(review) at the top of this section
      chunk_overlap: 800
      separator: " "
  splitters:
    # Text file configurations
    # `text` splits on spaces, `txt` on newlines; otherwise they are identical.
    text:
      type: character
      params:
        chunk_size: 2048 # see NOTE(review) at the top of this section
        chunk_overlap: 800
        separator: " "
        is_separator_regex: false
        keep_separator: true
        strip_whitespace: true
    txt:
      type: character
      params:
        chunk_size: 2048 # see NOTE(review) at the top of this section
        chunk_overlap: 800
        separator: "\n"
        is_separator_regex: false
        keep_separator: true
        strip_whitespace: true
    # Structured data configurations
    json:
      type: recursive_json
      params:
        chunk_size: 200
        chunk_overlap: 50
    html:
      type: html
      params:
        chunk_size: 2048
        chunk_overlap: 800
        tags_to_extract: ["p", "h1", "h2", "h3", "li"]
    markdown:
      type: markdown
      params:
        chunk_size: 2048
        chunk_overlap: 800
        headers_to_split_on: ["#", "##", "###", "####", "#####", "######"]
        strip_headers: false
    csv:
      type: csv
      params:
        chunk_size: 2048 # chunk size in number of rows
        include_header: false
    # Add Xml, Odt, Xlsx, and other formats as needed
309+
310+
# Embedding configuration
# NOTE(review): indentation was lost in this excerpt; in particular it is not
# visible here whether normalize_embeddings belongs to embedder_params or sits
# beside it directly under embedding — confirm against the original file.
311+
embedding: # LiteLLM embedding
312+
embedder_type: "openai"
313+
embedder_params:
314+
# OpenAI embeddings
# model, api_key and api_base are all `${...}` placeholders.
315+
model: ${OPENAI_EMBEDDING_MODEL}
316+
api_key: ${OPENAI_API_KEY}
317+
api_base: ${OPENAI_API_ENDPOINT}
318+
batch_size: 32
319+
additional_kwargs: {}
320+
# NOTE(review): `True` is capitalised here while every other boolean in this
# file is lowercase `true`/`false` — consider normalising to `true`.
normalize_embeddings: True
321+
322+
# Vector database configuration
vector_db:
  # Qdrant
  db_type: "qdrant"
  db_params:
    url: ${QDRANT_URL}
    api_key: ${QDRANT_API_KEY}
    collection_name: ${QDRANT_COLLECTION}
    # NOTE(review): presumably must equal the output dimension of the embedding
    # model configured in the embedding section — confirm, and check that the
    # substituted value is parsed as an integer.
    embedding_dimension: ${QDRANT_EMBEDDING_DIMENSION}
331+
332+
# LLM configuration: a list of model entries, each pairing an alias
# (model_name) with the litellm parameters used to call it.
llm:
  load_balancer:
    - model_name: "gpt-4o" # model alias
      litellm_params:
        model: openai/${OPENAI_MODEL_NAME}
        api_key: ${OPENAI_API_KEY}
        api_base: ${OPENAI_API_ENDPOINT}
        temperature: 0.01
        # add any other parameters here
    - model_name: "claude-3-5-sonnet" # model alias
      litellm_params:
        model: anthropic/${ANTHROPIC_MODEL_NAME}
        api_key: ${ANTHROPIC_API_KEY}
        api_base: ${ANTHROPIC_API_ENDPOINT}
        # add any other parameters here
    # add more models here
348+
349+
# Retrieval settings.
retrieval:
  top_k: 7
351+
352+
# Request/response messaging over the broker, reusing the shared broker
# connection settings anchored under shared_config.
broker_request_response:
  enabled: true
  broker_config: *broker_connection
  request_expiry_ms: 120000 # 120000 ms = 2 minutes
  payload_encoding: utf-8
  payload_format: json
  response_topic_prefix: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1
  response_queue_prefix: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1
360+
# The component's input is taken from the incoming message payload.
component_input:
  source_expression: input.payload
362+
363+
# Output to a Solace broker
- component_name: broker_output
  component_module: broker_output
  component_config:
    # Pull in the shared broker connection settings (anchor defined under
    # shared_config).
    <<: *broker_connection
    payload_encoding: utf-8
    payload_format: json
    copy_user_properties: true

0 commit comments

Comments
 (0)