Skip to content

Commit b0d8d2b

Browse files
nurlicht authored and tzolov committed
Add document Id generator
- Clean unnecessary classes/interfaces - straighten code style. - Clean tests. Resolves #113
1 parent 2704d53 commit b0d8d2b

File tree

6 files changed

+315
-3
lines changed

6 files changed

+315
-3
lines changed

spring-ai-core/src/main/java/org/springframework/ai/document/Document.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2023-2023 the original author or authors.
2+
* Copyright 2023-2024 the original author or authors.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -20,15 +20,21 @@
2020
import java.util.HashMap;
2121
import java.util.List;
2222
import java.util.Map;
23-
import java.util.UUID;
2423

2524
import com.fasterxml.jackson.annotation.JsonCreator;
2625
import com.fasterxml.jackson.annotation.JsonIgnore;
2726
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
2827
import com.fasterxml.jackson.annotation.JsonProperty;
2928

29+
import org.springframework.ai.document.id.IdGenerator;
30+
import org.springframework.ai.document.id.RandomIdGenerator;
3031
import org.springframework.util.Assert;
3132

33+
/**
34+
* A document is a container for the content and metadata of a document. It also contains
35+
* the document's unique ID and an optional embedding.
36+
* <p>
37+
*/
3238
@JsonIgnoreProperties({ "contentFormatter" })
3339
public class Document {
3440

@@ -68,7 +74,11 @@ public Document(@JsonProperty("content") String content) {
6874
}
6975

7076
public Document(String content, Map<String, Object> metadata) {
71-
this(UUID.randomUUID().toString(), content, metadata);
77+
this(content, metadata, new RandomIdGenerator());
78+
}
79+
80+
public Document(String content, Map<String, Object> metadata, IdGenerator idGenerator) {
81+
this(idGenerator.generateId(content, metadata), content, metadata);
7282
}
7383

7484
public Document(String id, String content, Map<String, Object> metadata) {
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/*
2+
* Copyright 2024-2024 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.springframework.ai.document.id;
17+
18+
/**
19+
* Interface for generating unique document IDs.
20+
*
21+
* @author Aliakbar Jafarpour
22+
* @author Christian Tzolov
23+
*/
24+
public interface IdGenerator {
25+
26+
/**
27+
* Generate a unique ID for the given content. Note: some generators, such as the
28+
* random generator, might not depend on or use the content parameters.
29+
* @param contents the content to generate an ID for.
30+
* @return the generated ID.
31+
*/
32+
String generateId(Object... contents);
33+
34+
}
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/*
2+
* Copyright 2024-2024 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.springframework.ai.document.id;
18+
19+
import java.io.ByteArrayOutputStream;
20+
import java.io.ObjectOutputStream;
21+
import java.nio.charset.Charset;
22+
import java.nio.charset.StandardCharsets;
23+
import java.security.MessageDigest;
24+
import java.security.NoSuchAlgorithmException;
25+
import java.util.UUID;
26+
27+
import org.springframework.util.Assert;
28+
29+
/**
30+
* A SHA-256 based ID generator that returns the hash as a UUID.
31+
*
32+
* @author Aliakbar Jafarpour
33+
* @author Christian Tzolov
34+
*/
35+
public class JdkSha256HexIdGenerator implements IdGenerator {
36+
37+
private static final String SHA_256 = "SHA-256";
38+
39+
private final String byteHexFormat = "%02x";
40+
41+
private final Charset charset;
42+
43+
private final MessageDigest messageDigest;
44+
45+
public JdkSha256HexIdGenerator(final String algorithm, final Charset charset) {
46+
this.charset = charset;
47+
try {
48+
this.messageDigest = MessageDigest.getInstance(algorithm);
49+
}
50+
catch (NoSuchAlgorithmException e) {
51+
throw new IllegalArgumentException(e);
52+
}
53+
}
54+
55+
public JdkSha256HexIdGenerator() {
56+
this(SHA_256, StandardCharsets.UTF_8);
57+
}
58+
59+
@Override
60+
public String generateId(Object... contents) {
61+
return this.hash(this.serializeToBytes(contents));
62+
}
63+
64+
// https://github.com/spring-projects/spring-ai/issues/113#issue-2000373318
65+
private String hash(byte[] contentWithMetadata) {
66+
byte[] hashBytes = getMessageDigest().digest(contentWithMetadata);
67+
StringBuilder sb = new StringBuilder();
68+
for (byte b : hashBytes) {
69+
sb.append(String.format(this.byteHexFormat, b));
70+
}
71+
return UUID.nameUUIDFromBytes(sb.toString().getBytes(this.charset)).toString();
72+
}
73+
74+
private byte[] serializeToBytes(Object... contents) {
75+
Assert.notNull(contents, "Contents must not be null");
76+
ByteArrayOutputStream byteOut = null;
77+
try {
78+
byteOut = new ByteArrayOutputStream();
79+
ObjectOutputStream out = new ObjectOutputStream(byteOut);
80+
for (Object content : contents) {
81+
out.writeObject(content);
82+
}
83+
return byteOut.toByteArray();
84+
}
85+
catch (Exception e) {
86+
throw new RuntimeException("Failed to serialize", e);
87+
}
88+
finally {
89+
if (byteOut != null) {
90+
try {
91+
byteOut.close();
92+
}
93+
catch (Exception e) {
94+
// ignore
95+
}
96+
}
97+
}
98+
}
99+
100+
MessageDigest getMessageDigest() {
101+
try {
102+
return (MessageDigest) messageDigest.clone();
103+
}
104+
catch (CloneNotSupportedException e) {
105+
throw new RuntimeException("Unsupported clone for MessageDigest.", e);
106+
}
107+
}
108+
109+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright 2024-2024 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.springframework.ai.document.id;
17+
18+
import java.util.UUID;
19+
20+
/**
21+
* A random ID generator that returns a UUID.
22+
*
23+
* @author Aliakbar Jafarpour
24+
* @author Christian Tzolov
25+
*/
26+
public class RandomIdGenerator implements IdGenerator {
27+
28+
@Override
29+
public String generateId(Object... contents) {
30+
return UUID.randomUUID().toString();
31+
}
32+
33+
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Copyright 2024-2024 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.springframework.ai.document.id;
17+
18+
import java.util.Map;
19+
import java.util.Set;
20+
import java.util.UUID;
21+
22+
import org.junit.jupiter.api.Assertions;
23+
import org.junit.jupiter.api.Test;
24+
25+
public class IdGeneratorProviderTest {
26+
27+
@Test
28+
void hashGeneratorGenerateSimilarIdsForSimilarContent() {
29+
30+
var idGenerator1 = new JdkSha256HexIdGenerator();
31+
var idGenerator2 = new JdkSha256HexIdGenerator();
32+
33+
final String content = "Content";
34+
final Map<String, Object> metadata = Map.of("metadata", Set.of("META_DATA"));
35+
36+
String actualHashes1 = idGenerator1.generateId(content, metadata);
37+
String actualHashes2 = idGenerator2.generateId(content, metadata);
38+
39+
Assertions.assertEquals(actualHashes1, actualHashes2);
40+
41+
// Assert (other expected behaviors)
42+
Assertions.assertDoesNotThrow(() -> UUID.fromString(actualHashes1));
43+
Assertions.assertDoesNotThrow(() -> UUID.fromString(actualHashes2));
44+
}
45+
46+
@Test
47+
void hashGeneratorGenerateDifferentIdsForDifferentContent() {
48+
49+
var idGenerator1 = new JdkSha256HexIdGenerator();
50+
var idGenerator2 = new JdkSha256HexIdGenerator();
51+
52+
final String content1 = "Content";
53+
final Map<String, Object> metadata1 = Map.of("metadata", Set.of("META_DATA"));
54+
final String content2 = content1 + " ";
55+
final Map<String, Object> metadata2 = metadata1;
56+
57+
String actualHashes1 = idGenerator1.generateId(content1, metadata1);
58+
String actualHashes2 = idGenerator2.generateId(content2, metadata2);
59+
60+
Assertions.assertNotEquals(actualHashes1, actualHashes2);
61+
62+
// Assert (other expected behaviors)
63+
Assertions.assertDoesNotThrow(() -> UUID.fromString(actualHashes1));
64+
Assertions.assertDoesNotThrow(() -> UUID.fromString(actualHashes2));
65+
}
66+
67+
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/*
2+
* Copyright 2024-2024 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package org.springframework.ai.document.id;
17+
18+
import java.nio.charset.Charset;
19+
import java.nio.charset.StandardCharsets;
20+
import java.security.MessageDigest;
21+
22+
import org.assertj.core.api.Assertions;
23+
import org.junit.jupiter.api.Test;
24+
25+
public class JdkSha256HexIdGeneratorTest {
26+
27+
private final JdkSha256HexIdGenerator testee = new JdkSha256HexIdGenerator();
28+
29+
@Test
30+
void messageDigestReturnsDistinctInstances() {
31+
final MessageDigest md1 = testee.getMessageDigest();
32+
final MessageDigest md2 = testee.getMessageDigest();
33+
34+
Assertions.assertThat(md1 != md2).isTrue();
35+
36+
Assertions.assertThat(md1.getAlgorithm()).isEqualTo(md2.getAlgorithm());
37+
Assertions.assertThat(md1.getDigestLength()).isEqualTo(md2.getDigestLength());
38+
Assertions.assertThat(md1.getProvider()).isEqualTo(md2.getProvider());
39+
Assertions.assertThat(md1.toString()).isEqualTo(md2.toString());
40+
}
41+
42+
@Test
43+
void messageDigestReturnsInstancesWithIndependentAndReproducibleDigests() {
44+
final String updateString1 = "md1_update";
45+
final String updateString2 = "md2_update";
46+
final Charset charset = StandardCharsets.UTF_8;
47+
48+
final byte[] md1BytesFirstTry = testee.getMessageDigest().digest(updateString1.getBytes(charset));
49+
final byte[] md2BytesFirstTry = testee.getMessageDigest().digest(updateString2.getBytes(charset));
50+
final byte[] md1BytesSecondTry = testee.getMessageDigest().digest(updateString1.getBytes(charset));
51+
final byte[] md2BytesSecondTry = testee.getMessageDigest().digest(updateString2.getBytes(charset));
52+
53+
Assertions.assertThat(md1BytesFirstTry).isNotEqualTo(md2BytesFirstTry);
54+
55+
Assertions.assertThat(md1BytesFirstTry).isEqualTo(md1BytesSecondTry);
56+
Assertions.assertThat(md2BytesFirstTry).isEqualTo(md2BytesSecondTry);
57+
}
58+
59+
}

0 commit comments

Comments
 (0)