Skip to content

Commit f9dc268

Browse files
committed
Add spellchecking to Solr. Fixes #60.
1 parent 2e288fe commit f9dc268

File tree

9 files changed

+146
-47
lines changed

9 files changed

+146
-47
lines changed

docker/solr/conf/managed-schema

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@
124124
<!-- Combine the "title" and "abstract" fields so that we search both when performing basic search -->
125125
<copyField source="title" dest="basic_search" />
126126
<copyField source="abstract" dest="basic_search" />
127-
<field name="basic_search" type="text_general" />
127+
<field name="basic_search" type="text_general" indexed="true" stored="false" />
128128

129129
<!--
130130
END COMPENDIUM FIELDS

docker/solr/conf/solrconfig.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,7 @@
849849
<!-- a spellchecker built from a field of the main index -->
850850
<lst name="spellchecker">
851851
<str name="name">default</str>
852-
<str name="field">_text_</str>
852+
<str name="field">title</str>
853853
<str name="classname">solr.DirectSolrSpellChecker</str>
854854
<!-- the spellcheck distance measure used, the default is the internal levenshtein -->
855855
<str name="distanceMeasure">internal</str>

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@ django[argon2] == 3.0.3
55
gunicorn == 20.0.4
66
psycopg2-binary == 2.8.4
77
pylibmc == 1.6.1
8-
pysolr == 3.8.1
98
python-dotenv == 0.11.0
9+
requests == 2.23.0

src/assets/css/style.css

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ a.anchor:hover {
119119

120120
.searchbar {
121121
background-color: white;
122-
border: 4px black;
122+
border: 1px solid black;
123123
}
124124

125125
/****************************

src/search/forms.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,12 @@ class BasicSearchForm(forms.Form):
1919
query = forms.CharField(
2020
widget=SearchInput(
2121
attrs={
22-
"class": "uk-input uk-form-large",
22+
"class": "searchbar uk-input uk-form-large",
2323
"type": "search",
2424
"placeholder": "Search...",
2525
}
26-
)
26+
),
27+
required=False,
2728
)
2829

2930
rows = forms.IntegerField(initial=20, widget=forms.HiddenInput(), required=False,)
@@ -72,7 +73,7 @@ def clean_page(self):
7273

7374
def clean(self):
7475
cleaned_data = super().clean()
75-
query_params = cleaned_data.pop("query")
76+
query_params = cleaned_data.pop("query", {})
7677
cleaned_data.update(query_params)
7778
return cleaned_data
7879

src/search/solr.py

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
Code for integrating with Apache Solr for search on the site
33
"""
44

5-
import pysolr
6-
5+
import logging
6+
import requests
77
from typing import Dict, List
88

99

@@ -13,10 +13,10 @@ class SearchEngine:
1313
to and query Apache Solr.
1414
"""
1515

16+
solr_logger = logging.getLogger("search.solr")
17+
1618
def __init__(self):
17-
self.solr = pysolr.Solr(
18-
"http://tec-search:8983/solr/compendium", always_commit=False
19-
)
19+
self.solr_url = "http://tec-search:8983/solr/compendium"
2020

2121
"""
2222
Search functions
@@ -31,25 +31,39 @@ def basic_search(self, query: Dict[str, List[str]]):
3131
tokens = query.get("quoted_substrings", [])
3232
tokens += query.get("words", [])
3333

34+
# If tokens == [], we assume that we didn't receive any words at all,
35+
# in which case we'll simply match everything in the compendium.
36+
if len(tokens) == 0:
37+
tokens = [""]
38+
3439
# Combine tokens into a single search query for Solr
35-
query_str = " && ".join(f"(abstract:{T!r} || title:{T!r})" for T in tokens)
40+
query_str = " && ".join(f"basic_search:*{T}*" for T in tokens)
3641

3742
# Add pagination
3843
page = query.get("page", 0)
3944
rows = query.get("rows", 20)
4045
start = page * rows
41-
kwargs = {
46+
data = {
47+
"q": query_str,
4248
"rows": rows,
4349
"start": start,
4450
"fl": "id,title,abstract,slug,year,month,day",
4551
}
4652

47-
results = self.solr.search(query_str, **kwargs)
53+
req = requests.get(f"{self.solr_url}/spell", params=data)
54+
results = req.json()
4855

49-
# Add information about the start, page number, and rows for
50-
# future handling
51-
results.page = page
52-
results.rows = rows
53-
results.start = start
56+
# Add some more useful data to the results dictionary
57+
results["meta"] = {
58+
"page": page,
59+
"rows": rows,
60+
}
61+
62+
# Do some logging to record the transaction
63+
qtime = results["responseHeader"]["QTime"]
64+
self.solr_logger.info(f"Solr query: {query_str}")
65+
self.solr_logger.debug(
66+
f"Solr query metadata: qtime={qtime}ms queryurl={req.url}"
67+
)
5468

5569
return results

src/search/views/mixins.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import abc
66
import json
7+
import logging
78

89
from django.core import serializers
910
from django.db.models import QuerySet
@@ -89,7 +90,13 @@ class BasicSearchMixin(metaclass=abc.ABCMeta):
8990
in them.
9091
"""
9192

93+
# Parent logger for search-related components
94+
search_logger = logging.getLogger("search")
95+
96+
# Name of the button used to perform search
9297
basic_search_button_name = "search"
98+
99+
# Wrapper class for querying Solr
93100
search_engine = SearchEngine()
94101

95102
def check_basic_search(self, request) -> bool:
@@ -102,23 +109,28 @@ def check_basic_search(self, request) -> bool:
102109

103110
return request.method == "GET" and self.basic_search_button_name in request.GET
104111

105-
def create_search_form(self, request):
112+
def create_search_form(self, data):
106113
"""
107114
Return a form to display a basic search input on the page.
108115
"""
109-
form = BasicSearchForm(data=request.GET)
110-
return form
116+
if isinstance(data, dict):
117+
self.search_logger.info(f"Received search params: {data}")
118+
return BasicSearchForm(data=data)
119+
else:
120+
self.search_logger.info(f"Received search params: {data.GET.dict()}")
121+
return BasicSearchForm(data=data.GET)
111122

112-
def execute_basic_search(self, params):
123+
def execute_basic_search(self, request):
113124
"""
114125
Run a basic search request. Return all compendium entries matching
115126
the input query.
116127
"""
117128

118-
form = BasicSearchForm(data=params)
129+
form = self.create_search_form(request)
119130

120131
# TODO: more robust error checking
121132
if form.is_valid():
133+
self.search_logger.debug(f"Cleaned search params: {form.cleaned_data}")
122134
results = self.search_engine.basic_search(form.cleaned_data)
123135
else:
124136
raise Exception(str(form.errors))

src/search/views/search_results.py

Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import math
6+
import re
67

78
from django.core.paginator import Paginator
89
from django.shortcuts import render, redirect
@@ -19,50 +20,95 @@ class SearchView(BasicSearchMixin, View):
1920
default_pagination = 10
2021

2122
def get(self, request):
22-
results = self.execute_basic_search(request.GET)
23-
24-
query = request.GET.get("query", None)
23+
query = request.GET.dict()
24+
query.setdefault("query", "")
25+
self.search_logger.info(f"QUERY = {query}")
26+
results = self.execute_basic_search(query)
27+
search_form = self.create_search_form(query)
2528

2629
# Populate some CompendiumEntry objects with the data that we found
2730
# from Solr
28-
entries = [CompendiumEntry(**entry) for entry in results.docs]
31+
entries = results["response"]["docs"]
32+
entries = [CompendiumEntry(**entry) for entry in entries]
2933

30-
hits = results.hits
31-
rows = results.rows # Entries per page
32-
page = results.page
34+
hits = results["response"]["numFound"]
35+
rows = results["meta"]["rows"] # Entries per page
36+
page = results["meta"]["page"]
3337

3438
# Start/end result numbers
35-
start = results.start + 1
36-
end = results.start + len(entries)
39+
start = min(results["response"]["start"] + 1, hits)
40+
end = results["response"]["start"] + len(entries)
3741

3842
if hits == 0:
3943
n_pages = 0
4044
else:
4145
n_pages = math.ceil(hits / rows)
4246

4347
context = {
48+
"query": query,
49+
"search_form": search_form,
50+
"qtime": results["responseHeader"]["QTime"],
4451
"hits": hits,
4552
"page": page,
4653
"n_pages": n_pages,
4754
"rows": rows,
4855
"start": start,
4956
"end": end,
5057
"entries": entries,
51-
"query": query,
5258
"start": start,
5359
"rows": rows,
5460
}
55-
return render(request, "entry_list.html", context)
5661

57-
def get_old(self, request):
58-
entries = CompendiumEntry.objects.all()
62+
# Check spelling
63+
correctly_spelled, suggested_query = self._check_spelling(query, results)
5964

60-
# Paginate (don't show all of the results on a single page)
61-
paginator = Paginator(entries, self.default_pagination)
62-
page_obj = paginator.get_page(request.GET.get("page"))
63-
page_number = page_obj.number
65+
if not correctly_spelled:
66+
context["suggested_query"] = suggested_query
6467

65-
context = {
66-
"page_obj": page_obj,
67-
}
6868
return render(request, "entry_list.html", context)
69+
70+
"""
71+
Internal API
72+
"""
73+
74+
def _check_spelling(self, query: str, results: dict):
    """
    Check the spelling of the query that produced ``results`` (the raw
    JSON dictionary returned by Solr for a basic search). If the query
    is misspelled and there aren't many (or any) results returned by the
    query, provide a spelling suggestion.

    Parameters
    ----------
    query : str
        The search string entered by the user. For compatibility with
        callers that pass the whole dictionary of GET parameters, a dict
        is also accepted; its ``"query"`` entry is used in that case.

    results : dict
        Solr response. ``results["response"]["numFound"]`` is required;
        the ``"spellcheck"`` section is optional.

    Returns
    -------
    correctly_spelled : bool
        Whether or not the words in the query were correctly spelled.
        If no spelling mistakes were found, or there were sufficiently
        many results returned (more than 10), correctly_spelled is
        returned as True.

    suggested_query : Optional[str]
        A suggested query to replace the input query. Returns as None
        if no suggested query could be generated.
    """

    self.search_logger.info(results)

    # The view hands over request.GET.dict(); re.sub needs the actual
    # search string, so unwrap it here.
    if isinstance(query, dict):
        query = query.get("query", "")

    hits = results["response"]["numFound"]
    spellcheck = results.get("spellcheck") or {}
    correctly_spelled = spellcheck.get("correctlySpelled", True) or hits > 10

    if correctly_spelled:
        return correctly_spelled, None

    # Solr reports suggestions as a flat list that alternates between the
    # misspelled word and a dictionary describing its replacements:
    #   [word_0, info_0, word_1, info_1, ...]
    # (or as a mapping when json.nl=map is configured). Pair them up.
    raw = spellcheck.get("suggestions") or []
    if isinstance(raw, dict):
        pairs = list(raw.items())
    else:
        pairs = list(zip(raw[::2], raw[1::2]))

    if not pairs:
        return correctly_spelled, None

    # Create a "suggested query" by trying to fix every misspelled word
    # that was found.
    for word, info in pairs:
        if not isinstance(word, str) or not isinstance(info, dict):
            continue

        candidates = info.get("suggestion")
        if not candidates:
            continue

        # Entries are dictionaries when extended results are enabled and
        # plain strings otherwise; the first one is the top suggestion.
        top = candidates[0]
        replacement = top["word"] if isinstance(top, dict) else str(top)

        # Escape the word so regex metacharacters in user input are matched
        # literally, and use a callable so that the replacement text is
        # never interpreted as a regex template.
        query = re.sub(
            rf"\b{re.escape(word)}\b",
            lambda _match, _text=replacement: _text,
            query,
            flags=re.IGNORECASE,
        )

    return correctly_spelled, query

src/templates/entry_list.html

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,18 @@ <h1 class="uk-h1">
1414
<hr>
1515

1616
<div class="uk-container">
17-
<p>Seeing results {{ start }} - {{ end }} out of {{ hits }}</p>
17+
<div class="uk-grid-match" uk-grid>
18+
<div class="uk-width-1-2@m">
19+
<h3>Seeing results {{ start }} - {{ end }} out of {{ hits }}</h3>
20+
</div>
21+
<div class="uk-width-1-2@m uk-text-right uk-text-small">
22+
<span class="monospace uk-text-muted">Search finished in {{ qtime }}ms</span>
23+
</div>
24+
</div>
25+
26+
<div class="uk-container uk-width-1-2@m uk-text-center">
27+
{% include "includes/searchbar.html" with form=search_form only %}
28+
</div>
1829

1930
{% for entry in entries %}
2031
{% include "includes/entry_snippet.html" with entry=entry only %}
@@ -29,6 +40,20 @@ <h1 class="uk-h1">
2940
search results.
3041
{% endcomment %}
3142

43+
{% if hits == 0 %}
44+
<div class="uk-text-center">
45+
<h3>No results were found matching the query</h3>
46+
</div>
47+
{% endif %}
48+
49+
{% if suggested_query %}
50+
<p class="uk-text-lead">
51+
Did you mean to search for
52+
<a href="?query={{ suggested_query }}&rows={{ rows }}" class="uk-text-italic">{{ suggested_query }}</a>?
53+
</p>
54+
{% endif %}
55+
56+
{% if hits > 0 %}
3257
<div>
3358
<div class="uk-grid uk-text-center uk-width-1-1 paginator">
3459
{# Display options to go to the next and previous page(s) #}
@@ -37,7 +62,7 @@ <h1 class="uk-h1">
3762
<a href="?query={{ query }}&rows={{ rows }}&page=0">
3863
<span uk-icon="icon:chevron-double-left;ratio:1.5"></span> first
3964
</a> |
40-
<a href="?query={{ query }}&rows={{ rows }}page={{ page|add:-1 }}">previous</a>
65+
<a href="?query={{ query }}&rows={{ rows }}&page={{ page|add:-1 }}">previous</a>
4166
{% endif %}
4267
</div>
4368

@@ -55,6 +80,7 @@ <h1 class="uk-h1">
5580
</div>
5681
</div>
5782
</div>
83+
{% endif %}
5884

5985
</div>
6086
{% endblock %}

0 commit comments

Comments
 (0)