Skip to content
This repository was archived by the owner on Apr 20, 2020. It is now read-only.

Filter documents for searching #10

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions elasticsearch-aknn/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
FROM docker.elastic.co/elasticsearch/elasticsearch:6.2.4

# Copy the plugin sources into the image. The build context must also contain
# the pre-downloaded Oracle JDK/JRE .rpm files (see below).
# COPY is used instead of ADD because only a plain local copy is needed.
COPY . /aknn
WORKDIR /aknn


# Install Java 10
# Before building this image, download jdk-10.0.2_linux-x64_bin.rpm and
# jre-10.0.2_linux-x64_bin.rpm into the elasticsearch-aknn directory from
#   http://www.oracle.com/technetwork/java/javase/downloads/index.html
#   https://www.oracle.com/technetwork/java/javase/downloads/java-archive-javase10-4425482.html
#
# Install and clean up in a single RUN so the installers and the yum cache are
# not kept around in an extra image layer.
RUN yum -y install jdk-10.0.2_linux-x64_bin.rpm \
    && yum -y install jre-10.0.2_linux-x64_bin.rpm \
    && rm jdk-10.0.2_linux-x64_bin.rpm jre-10.0.2_linux-x64_bin.rpm \
    && yum clean all
ENV JAVA_HOME=/usr/java/jdk-10.0.2/


# Install gradle 4.9
# Download, unpack and delete the archive in one layer; the zip is not needed
# once it has been extracted to /opt/gradle.
RUN wget https://services.gradle.org/distributions/gradle-4.9-bin.zip \
    && mkdir /opt/gradle \
    && unzip -d /opt/gradle gradle-4.9-bin.zip \
    && rm gradle-4.9-bin.zip
ENV PATH=$PATH:/opt/gradle/gradle-4.9/bin


# Build & install the plugin
# Unit and integration tests are skipped (-x) to keep the image build fast.
RUN gradle clean build -x integTestRunner -x test
# -b (batch mode) accepts the plugin's permission prompt non-interactively.
RUN elasticsearch-plugin install -b file:build/distributions/elasticsearch-aknn-0.0.1-SNAPSHOT.zip

# Configure ElasticSearch
# Default JVM heap for the container; override ES_JAVA_OPTS at run time
# (e.g. from docker-compose) on hosts with less memory.
ENV ES_JAVA_OPTS="-Xms10g -Xmx10g"
44 changes: 44 additions & 0 deletions elasticsearch-aknn/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Two-node Elasticsearch cluster running the aknn plugin image.
version: '3'
services:
  # First node; the only one that publishes the HTTP port to the host.
  elasticsearch:
    image: jainaayush05/es-aknn:latest
    container_name: elasticsearch
    environment:
      - cluster.name=docker-cluster
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    # Unlimited memlock so that bootstrap.memory_lock=true can lock the heap.
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - esdata1:/usr/share/elasticsearch/data
    ports:
      - 9200:9200
    networks:
      - esnet
  # Second node; discovers the first node by its service name.
  elasticsearch2:
    image: jainaayush05/es-aknn:latest
    container_name: elasticsearch2
    environment:
      - cluster.name=docker-cluster
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
      - "discovery.zen.ping.unicast.hosts=elasticsearch"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    volumes:
      - esdata2:/usr/share/elasticsearch/data
    networks:
      - esnet

volumes:
  esdata1:
    driver: local
  esdata2:
    driver: local

networks:
  esnet:
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.WrapperQueryBuilder;
import org.elasticsearch.rest.BaseRestHandler;
import org.elasticsearch.rest.BytesRestResponse;
import org.elasticsearch.rest.RestController;
Expand Down Expand Up @@ -61,6 +62,7 @@ public class AknnRestAction extends BaseRestHandler {
// Key name "_aknn_vector"; presumably the document field holding the stored
// vector -- confirm against the usages elsewhere in this class.
private final String VECTOR_KEY = "_aknn_vector";
// Fallback for the "k1" search request parameter when the caller omits it.
private final Integer K1_DEFAULT = 99;
// Fallback for the "k2" search request parameter when the caller omits it.
private final Integer K2_DEFAULT = 10;
// Fallback for the "filter" search request parameter: an empty JSON object.
private final String FILTER_DEFAULT = "{}";

// TODO: add an option to the index endpoint handler that empties the cache.
// Cache of LshModel instances keyed by a String (NOTE(review): key semantics are
// not visible here -- verify). Backed by a plain HashMap, so it is never evicted
// automatically, which is what the TODO above is about.
private Map<String, LshModel> lshModelCache = new HashMap<>();
Expand Down Expand Up @@ -95,6 +97,30 @@ public static Double euclideanDistance(List<Double> A, List<Double> B) {
return Math.sqrt(squaredDistance);
}

/**
 * Computes the cosine similarity of two vectors: {@code (A . B) / (|A| * |B|)}.
 *
 * <p>NOTE: despite its name, this method returns the cosine <em>similarity</em>
 * (1.0 for identical directions, 0.0 for orthogonal vectors, -1.0 for opposite
 * directions), not a distance. The name and return convention are kept so that
 * existing callers continue to work.
 *
 * <p>If either vector has zero magnitude (including the case where {@code A} is
 * empty) the similarity is undefined and {@code 0.0} is returned instead of NaN.
 *
 * @param A first vector; its length determines how many components are read
 * @param B second vector; must have at least as many components as {@code A}
 * @return the cosine similarity, or {@code 0.0} when it is undefined
 * @throws IndexOutOfBoundsException if {@code B} is shorter than {@code A}
 */
public static Double cosineDistance(List<Double> A, List<Double> B) {
    // Primitive accumulators avoid boxing/unboxing on every iteration.
    double dotProduct = 0.0;
    double squaredNormA = 0.0;
    double squaredNormB = 0.0;

    for (int i = 0; i < A.size(); i++) {
        final double a = A.get(i);
        final double b = B.get(i);
        dotProduct += a * b;    // a.b
        squaredNormA += a * a;  // sum(a^2)
        squaredNormB += b * b;  // sum(b^2)
    }

    final double normProduct = Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB);

    // The similarity is only defined when BOTH magnitudes are non-zero. The
    // previous check used a non-short-circuit OR, so a single zero vector fell
    // through to 0/0 and produced NaN.
    if (normProduct == 0.0) {
        return 0.0;
    }
    return dotProduct / normProduct;
}

private RestChannelConsumer handleSearchRequest(RestRequest restRequest, NodeClient client) throws IOException {

StopWatch stopWatch = new StopWatch("StopWatch to Time Search Request");
Expand All @@ -106,6 +132,7 @@ private RestChannelConsumer handleSearchRequest(RestRequest restRequest, NodeCli
final String id = restRequest.param("id");
final Integer k1 = restRequest.paramAsInt("k1", K1_DEFAULT);
final Integer k2 = restRequest.paramAsInt("k2", K2_DEFAULT);
final String f = restRequest.param("filter", FILTER_DEFAULT);
stopWatch.stop();

logger.info("Get query document at {}/{}/{}", index, type, id);
Expand All @@ -126,12 +153,13 @@ private RestChannelConsumer handleSearchRequest(RestRequest restRequest, NodeCli
stopWatch.stop();

// Retrieve the documents with most matching hashes. https://stackoverflow.com/questions/10773581
// http://javadoc.kyubu.de/elasticsearch/HEAD/org/elasticsearch/index/query/WrapperQueryBuilder.html
logger.info("Build boolean query from hashes");
stopWatch.start("Build boolean query from hashes");
QueryBuilder queryBuilder = QueryBuilders.boolQuery();
for (Map.Entry<String, Long> entry : queryHashes.entrySet()) {
String termKey = HASHES_KEY + "." + entry.getKey();
((BoolQueryBuilder) queryBuilder).should(QueryBuilders.termQuery(termKey, entry.getValue()));
((BoolQueryBuilder) queryBuilder).filter(new WrapperQueryBuilder(f)).should(QueryBuilders.termQuery(termKey, entry.getValue()));
}
stopWatch.stop();

Expand Down Expand Up @@ -161,7 +189,7 @@ private RestChannelConsumer handleSearchRequest(RestRequest restRequest, NodeCli
put("_index", hit.getIndex());
put("_id", hit.getId());
put("_type", hit.getType());
put("_score", euclideanDistance(queryVector, hitVector));
put("_score", cosineDistance(queryVector, hitVector));
put("_source", hitSource);
}});
}
Expand Down