Skip to content

Commit 42e9b84

Browse files
committed
feat: Add Mistral AI OCR API integration
This commit introduces the Mistral AI OCR API integration to Spring AI.

- Added `MistralOcrApi` class for interacting with the Mistral OCR endpoint.
- Added `MistralOcrApiIT` integration test.
- Updated documentation to include OCR API usage instructions.

This allows users to extract text and image data from documents using Mistral AI's OCR capabilities.

Announcement: https://mistral.ai/news/mistral-ocr/

Signed-off-by: Alexandros Pappas <apappascs@gmail.com>
1 parent 6b25b62 commit 42e9b84

File tree

3 files changed

+414
-0
lines changed

3 files changed

+414
-0
lines changed
Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
/*
2+
* Copyright 2025-2025 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.springframework.ai.mistralai.api;
18+
19+
import java.util.List;
20+
import java.util.Objects;
21+
import java.util.function.Consumer;
22+
23+
import com.fasterxml.jackson.annotation.JsonInclude;
24+
import com.fasterxml.jackson.annotation.JsonInclude.Include;
25+
import com.fasterxml.jackson.annotation.JsonProperty;
26+
27+
import org.springframework.ai.retry.RetryUtils;
28+
import org.springframework.http.HttpHeaders;
29+
import org.springframework.http.MediaType;
30+
import org.springframework.http.ResponseEntity;
31+
import org.springframework.util.Assert;
32+
import org.springframework.web.client.ResponseErrorHandler;
33+
import org.springframework.web.client.RestClient;
34+
35+
/**
36+
* Java Client library for the Mistral AI OCR API. Provides access to the OCR
37+
* functionality.
38+
* <p>
39+
* The API processes a document and returns a markdown string representation of the text,
40+
* along with information about extracted images.
41+
*
42+
* @author Alexandros Pappas
43+
*/
44+
public class MistralOcrApi {
45+
46+
private static final String DEFAULT_BASE_URL = "https://api.mistral.ai";
47+
48+
private final RestClient restClient;
49+
50+
/**
51+
* Create a new MistralOcrApi instance.
52+
* @param mistralAiApiKey Mistral AI API key.
53+
*/
54+
public MistralOcrApi(String mistralAiApiKey) {
55+
this(DEFAULT_BASE_URL, mistralAiApiKey);
56+
}
57+
58+
/**
59+
* Create a new MistralOcrApi instance.
60+
* @param baseUrl API base URL.
61+
* @param mistralAiApiKey Mistral AI API key.
62+
*/
63+
public MistralOcrApi(String baseUrl, String mistralAiApiKey) {
64+
this(baseUrl, mistralAiApiKey, RestClient.builder());
65+
}
66+
67+
/**
68+
* Create a new MistralOcrApi instance.
69+
* @param baseUrl API base URL.
70+
* @param mistralAiApiKey Mistral AI API key.
71+
* @param restClientBuilder RestClient builder.
72+
*/
73+
public MistralOcrApi(String baseUrl, String mistralAiApiKey, RestClient.Builder restClientBuilder) {
74+
this(baseUrl, mistralAiApiKey, restClientBuilder, RetryUtils.DEFAULT_RESPONSE_ERROR_HANDLER);
75+
}
76+
77+
/**
78+
* Create a new MistralOcrApi instance.
79+
* @param baseUrl API base URL.
80+
* @param mistralAiApiKey Mistral AI API key.
81+
* @param restClientBuilder RestClient builder.
82+
* @param responseErrorHandler Response error handler.
83+
*/
84+
public MistralOcrApi(String baseUrl, String mistralAiApiKey, RestClient.Builder restClientBuilder,
85+
ResponseErrorHandler responseErrorHandler) {
86+
87+
Consumer<HttpHeaders> jsonContentHeaders = headers -> {
88+
headers.setBearerAuth(mistralAiApiKey);
89+
headers.setContentType(MediaType.APPLICATION_JSON);
90+
};
91+
92+
this.restClient = restClientBuilder.baseUrl(baseUrl)
93+
.defaultHeaders(jsonContentHeaders)
94+
.defaultStatusHandler(responseErrorHandler)
95+
.build();
96+
}
97+
98+
/**
99+
* Performs OCR on a document and returns the extracted information.
100+
* @param ocrRequest The OCR request containing document details and processing
101+
* options.
102+
* @return ResponseEntity containing the OCR response with markdown text and image
103+
* data.
104+
*/
105+
public ResponseEntity<OCRResponse> ocr(OCRRequest ocrRequest) {
106+
107+
Assert.notNull(ocrRequest, "The request body can not be null.");
108+
Assert.notNull(ocrRequest.model(), "The model can not be null.");
109+
Assert.notNull(ocrRequest.document(), "The document can not be null.");
110+
111+
return this.restClient.post().uri("/v1/ocr").body(ocrRequest).retrieve().toEntity(OCRResponse.class);
112+
}
113+
114+
/**
115+
* List of well-known Mistral OCR models.
116+
*/
117+
public enum OCRModel {
118+
119+
MISTRAL_OCR_LATEST("mistral-ocr-latest");
120+
121+
private final String value;
122+
123+
OCRModel(String value) {
124+
this.value = value;
125+
}
126+
127+
public String getValue() {
128+
return value;
129+
}
130+
131+
}
132+
133+
/**
134+
* Represents the request for the OCR API.
135+
*
136+
* @param model Model to use for OCR. Can be 'mistral-ocr-latest'
137+
* @param id An optional string identifier.
138+
* @param document Document to run OCR on. Can be either a {@link DocumentURLChunk} or
139+
* an {@link ImageURLChunk}.
140+
* @param pages Specific pages to process in various formats: single number, range, or
141+
* list of both. Starts from 0.
142+
* @param includeImageBase64 Whether to include image URLs in the response.
143+
* @param imageLimit Maximum number of images to extract.
144+
* @param imageMinSize Minimum height and width of image to extract.
145+
*/
146+
@JsonInclude(Include.NON_NULL)
147+
public record OCRRequest(@JsonProperty("model") String model, @JsonProperty("id") String id,
148+
@JsonProperty("document") Document document, @JsonProperty("pages") List<Integer> pages,
149+
@JsonProperty("include_image_base64") Boolean includeImageBase64,
150+
@JsonProperty("image_limit") Integer imageLimit, @JsonProperty("image_min_size") Integer imageMinSize) {
151+
152+
/**
153+
* Create an OCRRequest.
154+
* @param model The model to use for OCR.
155+
* @param document Document to run OCR on.
156+
*/
157+
public OCRRequest(String model, Document document) {
158+
this(model, null, document, null, null, null, null);
159+
}
160+
161+
/**
162+
* Represents the document to be processed, which can be either a document URL or
163+
* an image URL. Only one of the fields should be set.
164+
*/
165+
@JsonInclude(Include.NON_NULL)
166+
public sealed interface Document permits DocumentURLChunk, ImageURLChunk {
167+
168+
}
169+
170+
/**
171+
* Represents a document URL chunk.
172+
*
173+
* @param type Must be 'document_url'.
174+
* @param documentUrl URL of the document.
175+
* @param documentName Optional name of the document.
176+
*/
177+
@JsonInclude(Include.NON_NULL)
178+
public record DocumentURLChunk(
179+
180+
@JsonProperty("type") String type, @JsonProperty("document_url") String documentUrl,
181+
@JsonProperty("document_name") String documentName) implements Document {
182+
183+
/**
184+
* Create a DocumentURLChunk.
185+
* @param documentUrl URL of the document.
186+
*/
187+
public DocumentURLChunk(String documentUrl) {
188+
this("document_url", documentUrl, null);
189+
}
190+
}
191+
192+
/**
193+
* Represents an image URL chunk.
194+
*
195+
* @param type Must be 'image_url'.
196+
* @param imageUrl URL of the image.
197+
* @param imageName Optional name of the image.
198+
*/
199+
@JsonInclude(Include.NON_NULL)
200+
public record ImageURLChunk(
201+
202+
@JsonProperty("type") String type, @JsonProperty("image_url") String imageUrl,
203+
@JsonProperty("image_name") String imageName) implements Document {
204+
205+
/**
206+
* Create an ImageURLChunk.
207+
* @param imageUrl URL of the image.
208+
*/
209+
public ImageURLChunk(String imageUrl) {
210+
this("image_url", imageUrl, null);
211+
}
212+
}
213+
}
214+
215+
/**
216+
* Represents the response from the OCR API.
217+
*
218+
* @param pages List of OCR info for pages.
219+
* @param model The model used to generate the OCR.
220+
* @param usageInfo Usage info for the OCR request.
221+
* @param pagesProcessed Number of pages processed.
222+
* @param docSizeBytes Document size in bytes.
223+
*/
224+
@JsonInclude(Include.NON_NULL)
225+
public record OCRResponse(@JsonProperty("pages") List<OCRPage> pages, @JsonProperty("model") String model,
226+
@JsonProperty("usage_info") OCRUsageInfo usageInfo, @JsonProperty("pages_processed") Integer pagesProcessed,
227+
@JsonProperty("doc_size_bytes") Integer docSizeBytes) {
228+
229+
}
230+
231+
/**
232+
* Represents OCR information for a single page.
233+
*
234+
* @param index The page index in a PDF document starting from 0.
235+
* @param markdown The markdown string response of the page.
236+
* @param images List of all extracted images in the page.
237+
* @param dimensions The dimensions of the PDF Page's screenshot image.
238+
*/
239+
@JsonInclude(Include.NON_NULL)
240+
public record OCRPage(@JsonProperty("index") Integer index, @JsonProperty("markdown") String markdown,
241+
@JsonProperty("images") List<ExtractedImage> images,
242+
@JsonProperty("dimensions") OCRPageDimensions dimensions) {
243+
}
244+
245+
/**
246+
* Represents an extracted image from a page.
247+
*
248+
* @param id Image ID for the extracted image in a page.
249+
* @param topLeftX X coordinate of the top-left corner of the extracted image.
250+
* @param topLeftY Y coordinate of the top-left corner of the extracted image.
251+
* @param bottomRightX X coordinate of the bottom-right corner of the extracted image.
252+
* @param bottomRightY Y coordinate of the bottom-right corner of the extracted image.
253+
* @param imageBase64 Base64 string of the extracted image.
254+
*/
255+
@JsonInclude(Include.NON_NULL)
256+
public record ExtractedImage(@JsonProperty("id") String id, @JsonProperty("top_left_x") Integer topLeftX,
257+
@JsonProperty("top_left_y") Integer topLeftY, @JsonProperty("bottom_right_x") Integer bottomRightX,
258+
@JsonProperty("bottom_right_y") Integer bottomRightY, @JsonProperty("image_base64") String imageBase64) {
259+
260+
@Override
261+
public boolean equals(Object o) {
262+
if (this == o)
263+
return true;
264+
if (!(o instanceof ExtractedImage that))
265+
return false;
266+
return Objects.equals(id, that.id) && Objects.equals(topLeftX, that.topLeftX)
267+
&& Objects.equals(topLeftY, that.topLeftY) && Objects.equals(bottomRightX, that.bottomRightX)
268+
&& Objects.equals(bottomRightY, that.bottomRightY) && Objects.equals(imageBase64, that.imageBase64);
269+
}
270+
271+
@Override
272+
public int hashCode() {
273+
return Objects.hash(id, topLeftX, topLeftY, bottomRightX, bottomRightY, imageBase64);
274+
}
275+
}
276+
277+
/**
278+
* Represents the dimensions of a PDF page's screenshot image.
279+
*
280+
* @param dpi Dots per inch of the page-image.
281+
* @param height Height of the image in pixels.
282+
* @param width Width of the image in pixels.
283+
*/
284+
@JsonInclude(Include.NON_NULL)
285+
public record OCRPageDimensions(@JsonProperty("dpi") Integer dpi, @JsonProperty("height") Integer height,
286+
@JsonProperty("width") Integer width) {
287+
}
288+
289+
/**
290+
* Represents usage information for the OCR request.
291+
*
292+
* @param pagesProcessed Number of pages processed.
293+
* @param docSizeBytes Document size in bytes.
294+
*/
295+
@JsonInclude(Include.NON_NULL)
296+
public record OCRUsageInfo(@JsonProperty("pages_processed") Integer pagesProcessed,
297+
@JsonProperty("doc_size_bytes") Integer docSizeBytes) {
298+
}
299+
300+
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/*
2+
* Copyright 2025-2025 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.springframework.ai.mistralai.api;
18+
19+
import java.util.List;
20+
21+
import org.junit.jupiter.api.Test;
22+
import org.springframework.http.ResponseEntity;
23+
24+
import static org.assertj.core.api.Assertions.assertThat;
25+
import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable;
26+
27+
/**
28+
* Tests for the Mistral OCR API.
29+
*
30+
* @author Alexandros Pappas
31+
*/
32+
@EnabledIfEnvironmentVariable(named = "MISTRAL_AI_API_KEY", matches = ".+")
33+
class MistralOcrApiIT {
34+
35+
MistralOcrApi mistralOcr = new MistralOcrApi(System.getenv("MISTRAL_AI_API_KEY"));
36+
37+
@Test
38+
void ocrTest() {
39+
String documentUrl = "https://arxiv.org/pdf/2201.04234";
40+
MistralOcrApi.OCRRequest request = new MistralOcrApi.OCRRequest(
41+
MistralOcrApi.OCRModel.MISTRAL_OCR_LATEST.getValue(), "test_id",
42+
new MistralOcrApi.OCRRequest.DocumentURLChunk(documentUrl), List.of(0, 1, 2), true, 5, 50);
43+
44+
ResponseEntity<MistralOcrApi.OCRResponse> response = mistralOcr.ocr(request);
45+
46+
assertThat(response).isNotNull();
47+
assertThat(response.getBody()).isNotNull();
48+
assertThat(response.getBody().pages()).isNotNull();
49+
assertThat(response.getBody().pages()).isNotEmpty();
50+
assertThat(response.getBody().pages().get(0).markdown()).isNotEmpty();
51+
52+
if (request.includeImageBase64() != null && request.includeImageBase64()) {
53+
assertThat(response.getBody().pages().get(1).images()).isNotNull();
54+
assertThat(response.getBody().pages().get(1).images().get(0).imageBase64()).isNotNull();
55+
}
56+
System.out.println(response);
57+
}
58+
59+
}

0 commit comments

Comments
 (0)