xynehq · junaid-shirur · Sep 19, 2025 · Sep 12, 2025 · Sep 12, 2025 · Sep 12, 2025
diff --git a/server/.spec-workflow/user-templates/README.md b/server/.spec-workflow/user-templates/README.md
@@ -0,0 +1,64 @@
+# User Templates
+
+This directory allows you to create custom templates that override the default Spec Workflow templates.
+
+## How to Use Custom Templates
+
+1. **Create your custom template file** in this directory with the exact same name as the default template you want to override:
+   - `requirements-template.md` - Override requirements document template
+   - `design-template.md` - Override design document template
+   - `tasks-template.md` - Override tasks document template
+   - `product-template.md` - Override product steering template
+   - `tech-template.md` - Override tech steering template
+   - `structure-template.md` - Override structure steering template
+
+2. **Template Loading Priority**:
+   - The system first checks this `user-templates/` directory
+   - If a matching template is found here, it will be used
+   - Otherwise, the default template from `templates/` will be used
+
+## Example Custom Template
+
+To create a custom requirements template:
+
+1. Create a file named `requirements-template.md` in this directory
+2. Add your custom structure, for example:
+
+```markdown
+# Requirements Document
+
+## Executive Summary
+[Your custom section]
+
+## Business Requirements
+[Your custom structure]
+
+## Technical Requirements
+[Your custom fields]
+
+## Custom Sections
+[Add any sections specific to your workflow]
+```
+
+## Template Variables
+
+Templates can include placeholders that will be replaced when documents are created:
+- `{{projectName}}` - The name of your project
+- `{{featureName}}` - The name of the feature being specified
+- `{{date}}` - The current date
+- `{{author}}` - The document author
+
+## Best Practices
+
+1. **Start from defaults**: Copy a default template from `../templates/` as a starting point
+2. **Keep structure consistent**: Maintain similar section headers for tool compatibility
+3. **Document changes**: Add comments explaining why sections were added/modified
+4. **Version control**: Track your custom templates in version control
+5. **Test thoroughly**: Ensure custom templates work with the spec workflow tools
+
+## Notes
+
+- Custom templates are project-specific and not included in the package distribution
+- The `templates/` directory contains the default templates which are updated with each version
+- Your custom templates in this directory are preserved during updates
+- If a custom template has errors, the system will fall back to the default template
@@ -2414,4 +2414,48 @@ Without these connections, I can only provide general assistance and cannot acce
 - Project-specific data
 - Company knowledge bases
 
-I'm still here to help with general questions, explanations, and tasks that don't require access to your personal workspace data. How can I assist you today?`
+I'm still here to help with general questions, explanations, and tasks that don't require access to your personal workspace data. How can I assist you today?`
+
+// PDF chunking prompt for Gemini: OCR page(s) to Markdown, render tables as HTML, describe images inline ("Image:"), and emit only <chunk>...</chunk> blocks.
+// NOTE(review): CHUNKING targets 250–512 words per chunk but hard-caps chunks at 1024 bytes (UTF-8); ordinary prose likely hits the byte cap first — confirm the intended limit.
+export const CHUNKING_PROMPT = `\
+OCR the provided PDF page(s) into clean Markdown with enriched table and image handling, then segment into coherent RAG-ready chunks.
+
+GLOBAL RULES:
+- Preserve text structure as Markdown (headings, paragraphs, lists, footnotes).
+- Keep reading order across pages; prefer natural section boundaries.
+- No hallucinations. If content is unreadable, write [illegible].
+- Do not surround output with triple backticks or any code fences.
+- Output ONLY a sequence of <chunk>...</chunk> blocks. No extra commentary.
+
+TABLES (including tables shown inside images):
+- Extract ALL tables completely; never summarize or omit cells.
+- Represent EVERY table as HTML: <table><thead><tr><th>…</th></tr></thead><tbody><tr><td>…</td></tr>…</tbody></table>.
+- Keep the entire table within a single chunk when possible.
+- If a table must be split across chunks due to limits:
+  - Split on complete rows only; never split a cell.
+  - Repeat the full header row (<thead>) at the start of the next chunk.
+  - Add "(table continues)" at the end of the first part and "(table continued)" at the start of the next part.
+
+IMAGES, FIGURES, CHARTS, DIAGRAMS:
+- Insert an inline marker at the exact location where the image appears:
+  - Begin a new paragraph starting with "Image:" and provide a rich, thorough description.
+  - Describe the scene, axes, legends, units, labels, key values, trends, colors, shapes, and any text in the image.
+- If the image contains tabular data, transcribe it immediately after the description as an HTML <table> (same structure as above).
+- For charts, add 1–2 sentences summarizing key insights after the description.
+
+CHUNKING:
+- Group content by semantic theme (e.g., subsection, self-contained explanation, contiguous table).
+- Target 250–512 words per chunk with a hard maximum of 1024 bytes (UTF-8).
+- If 250–512 words would exceed 1024 bytes, end early to respect the byte limit and continue in the next chunk.
+- Do not break sentences, list items, or table rows across chunks unless unavoidable due to the byte limit.
+- When continuing content in the next chunk, begin with a brief "(continued)" cue to retain context.
+- Maintain flow: image descriptions and any extracted tables must appear inline where the image occurs so readers know an image was present there.
+
+FORMATTING:
+- Surround each chunk with <chunk> ... </chunk> tags.
+- Inside chunks, use valid Markdown and HTML (<table> only).
+- Keep whitespace clean; avoid double spaces and stray line breaks.
+
+Begin now and emit only <chunk> blocks.
+`
@@ -30,6 +30,7 @@ import { DeleteDocument } from "@/search/vespa"
 import type { VespaSchema } from "@xyne/vespa-ts/types"
 import config from "@/config"
 import { getErrorMessage } from "@/utils"
+import { isDataSourceError } from "@/integrations/dataSource/errors"
 import {
   removeAppIntegrationFromAllAgents,
   getAgentsByDataSourceId,
@@ -197,6 +198,10 @@ export async function handleSingleFileUploadToDataSource(
         flag,
       },
     )
+    if (isDataSourceError(error)) {
+      // Preserve DataSourceError so UI can display error.userMessage
+      throw error
+    }
     if (
       error instanceof Error &&
       (error.message.includes("already exists") ||

@@ -23,6 +23,7 @@ import type { AttachmentMetadata } from "@/shared/types"
 import { FileProcessorService } from "@/services/fileProcessor"
 import { Apps, KbItemsSchema, KnowledgeBaseEntity } from "@xyne/vespa-ts/types"
 import { getBaseMimeType } from "@/integrations/dataSource/config"
+import { isDataSourceError } from "@/integrations/dataSource/errors"
 
 const { JwtPayloadKey } = config
 const loggerWithChild = getLoggerWithChild(Subsystem.Api, { module: "newApps" })
@@ -139,9 +140,11 @@ export const handleFileUpload = async (c: Context) => {
         )
       } catch (error) {
         const errorMessage =
-          error instanceof Error
-            ? error.message
-            : "Unknown error during DataSource processing"
+          isDataSourceError(error)
+            ? error.userMessage
+            : error instanceof Error
+              ? error.message
+              : "Unknown error during DataSource processing"
         loggerWithChild({ email: email }).error(
           error,
           `Error processing file "${file.name}" for DataSource`,

@@ -26,6 +26,16 @@ export class FileSizeExceededError extends FileValidationError {
   }
 }
 
+// Validation error for a PDF in which one page is over the per-page size limit; builds the error message and the user-facing message handed to FileValidationError.
+export class PdfPageTooLargeError extends FileValidationError {
+  constructor(pageNumber: number, maxSizeMB: number, actualBytes: number) {
+    const pageSizeMB = (actualBytes / (1024 * 1024)).toFixed(2)
+    const detail = `PDF page ${pageNumber} size ${pageSizeMB}MB exceeds maximum allowed per-page limit of ${maxSizeMB}MB`
+    const guidance = `One page in the PDF is too large (${pageSizeMB}MB). Please compress or split the PDF so each page is under ${maxSizeMB}MB.`
+    super(detail, guidance)
+  }
+}
+
 export class UnsupportedFileTypeError extends FileValidationError {
   constructor(mimeType: string, supportedTypes: string[]) {
     const message = `Unsupported file type: ${mimeType}`

@@ -32,7 +32,7 @@ import {
 } from "./errors"
 import { describeImageWithllm } from "@/lib/describeImageWithllm"
 import { promises as fsPromises } from "fs"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
 import imageType from "image-type"
@@ -208,7 +208,7 @@ const processPdfContent = async (
   try {
     const docId = `dsf-${createId()}`
     const { text_chunks, image_chunks, text_chunk_pos, image_chunk_pos } =
-      await extractTextAndImagesWithChunksFromPDF(pdfBuffer, docId, true)
+      await extractTextAndImagesWithChunksFromPDFviaGemini(pdfBuffer, docId)
     if (text_chunks.length === 0 && image_chunks.length === 0) {
       throw new ContentExtractionError(
         "No chunks generated from PDF content",

@@ -19,7 +19,7 @@ import {
 import * as XLSX from "xlsx"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
 
 const Logger = getLogger(Subsystem.Integrations).child({ module: "google" })
 
@@ -49,10 +49,9 @@ const processPdfFile = async (
 ): Promise<string[]> => {
   try {
     // Handle non-spreadsheet files as before
-    const pdfResult = await extractTextAndImagesWithChunksFromPDF(
+    const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
       pdfBuffer,
       attachmentId,
-      false, // Don't extract images for email attachments
     )
     return pdfResult.text_chunks.filter((v) => v.trim())
   } catch (error) {

@@ -17,7 +17,7 @@ import {
 import * as XLSX from "xlsx"
 import { extractTextAndImagesWithChunksFromDocx } from "@/docxChunks"
 import { extractTextAndImagesWithChunksFromPptx } from "@/pptChunks"
-import { extractTextAndImagesWithChunksFromPDF } from "@/pdfChunks"
+import { extractTextAndImagesWithChunksFromPDFviaGemini } from "@/lib/chunkPdfWithGemini"
 import { makeGraphApiCall, type MicrosoftGraphClient } from "./client"
 
 const Logger = getLogger(Subsystem.Integrations).child({
@@ -48,10 +48,9 @@ const processPdfFile = async (
   attachmentId: string,
 ): Promise<string[]> => {
   try {
-    const pdfResult = await extractTextAndImagesWithChunksFromPDF(
+    const pdfResult = await extractTextAndImagesWithChunksFromPDFviaGemini(
       pdfBuffer,
       attachmentId,
-      false, // Don't extract images for email attachments
     )
     return pdfResult.text_chunks.filter((v) => v.trim())
   } catch (error) {