Skip to content

Get reformulator working #7

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: public
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 21 additions & 33 deletions reformulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,12 +269,12 @@ def reformulate(self):
if self.exclude_version:
query += f" -AnkiReformulator:\"*version*=*'{self.VERSION}'*\""

# load db just in case
# load db just in case, and create one if it doesn't already exist
self.db_content = self.load_db()
if not self.db_content:
red("Empty database. If you have already ran anki_reformulator "
"before then something went wrong!")
whi("Trying to create a new database")
whi("Creating a empty database")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: this should be "an" (i.e. "Creating an empty database").

self.save_to_db({})
self.db_content = self.load_db()
assert self.db_content, "Could not create database"
Expand Down Expand Up @@ -507,7 +507,7 @@ def reformulate_note(self, nid: int, note: pd.Series) -> Dict:
# reformulate the content
content = note["fields"][self.field_name]["value"]
log["note_field_content"] = content
formattedcontent = self.cloze_input_parser(content) if iscloze(content) else content
formattedcontent = self.cloze_input_parser(content)
log["note_field_formattedcontent"] = formattedcontent

# if the card is in the dataset, just take the dataset value directly
Expand Down Expand Up @@ -537,11 +537,13 @@ def reformulate_note(self, nid: int, note: pd.Series) -> Dict:
fc, media = replace_media(
content=formattedcontent,
media=None,
mode="remove_media",
)
mode="remove_media")
log["media"] = media

if not skip_llm:
if skip_llm:
log["llm_answer"] = {"Skipped": True}
log["dollar_price"] = 0
else:
dataset = copy.deepcopy(self.dataset)
curr_mess = [{"role": "user", "content": fc}]
dataset = semantic_prompt_filtering(
Expand All @@ -553,8 +555,7 @@ def reformulate_note(self, nid: int, note: pd.Series) -> Dict:
embedding_model=self.embedding_model,
whi=whi,
yel=yel,
red=red,
)
red=red)
dataset += curr_mess

assert dataset[0]["role"] == "system", "First message is not from system!"
Expand Down Expand Up @@ -597,20 +598,14 @@ def reformulate_note(self, nid: int, note: pd.Series) -> Dict:
)
else:
log["dollar_price"] = "?"
else:
log["llm_answer"] = {"Skipped": True}
log["dollar_price"] = 0

log["note_field_newcontent"] = newcontent
formattednewcontent = self.cloze_output_parser(newcontent) if iscloze(newcontent) else newcontent
formattednewcontent = self.cloze_output_parser(newcontent)
log["note_field_formattednewcontent"] = formattednewcontent
log["status"] = STAT_OK_REFORM

if iscloze(content) and iscloze( newcontent + formattednewcontent):
# check that no cloze were lost
# TODO: Bug here: `iscloze` can return true if the new content is a
# close, but if the original content is not a cloze, then this
# fails
for cl in getclozes(content):
cl = cl.split("::")[0] + "::"
assert cl.startswith("{{c") and cl in content
Expand Down Expand Up @@ -734,18 +729,14 @@ def reset_note(self, nid: int, note: pd.Series) -> Dict:
]

if not entries:
red(
f"Entry not found for note {nid}. Looking for the content of "
"the field AnkiReformulator"
)
red(f"Entry not found for note {nid}. Looking for the content of "
"the field AnkiReformulator")
logfield = note["fields"]["AnkiReformulator"]["value"]
logfield = logfield.split(
"<!--SEPARATOR-->")[0] # keep most recent
if not logfield.strip():
raise Exception(
f"Note {nid} was not found in the db and its "
"AnkiReformulator field was empty."
)
raise Exception(f"Note {nid} was not found in the db and its "
"AnkiReformulator field was empty.")

# replace the [[c1::cloze]] by {{c1::cloze}}
logfield = logfield.replace("]]", "}}")
Expand All @@ -755,7 +746,7 @@ def reset_note(self, nid: int, note: pd.Series) -> Dict:

# parse old content
buffer = []
for i, line in enumerate(logfield.split("<br>")):
for line in logfield.split("<br>"):
if buffer:
try:
_ = rtoml.loads("".join(buffer + [line]))
Expand All @@ -774,10 +765,12 @@ def reset_note(self, nid: int, note: pd.Series) -> Dict:

# parse new content at the time
buffer = []
for i, line in enumerate(logfield.split("<br>")):
for line in logfield.split("<br>"):
if buffer:
try:
_ = rtoml.loads("".join(buffer + [line]))
# TODO: What are you trying to do here? Just check that adding the line keeps valid toml?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so. IIRC, the point of using TOML was to have a human-readable way to see what happened when using the addon. Since a lot of log data is packed into it, I also added code to try rolling back from it if --reset was used and recovery from the db failed. I'm fine with storing the data only in the db, but also with storing all the metadata of all scripts in a single field.

Btw, rtoml was better than toml in some respects, but I remember having quite a bit of trouble with it in some situations (like dumping then loading resulting in different values, especially when None values are involved); I can't remember anything more specific.

# If so, you should catch the specific exception that the load function raises on error
rtoml.loads("".join(buffer + [line]))
buffer.append(line)
continue
except Exception:
Expand Down Expand Up @@ -931,10 +924,8 @@ def apply_reset(self, log: Dict) -> None:

# remove TO_RESET tag if present
removetags(nid, "AnkiReformulator::TO_RESET")

# remove Done tag
removetags(nid, "AnkiReformulator::Done")

# remove DOING tag
removetags(nid, "AnkiReformulator::RESETTING")

Expand Down Expand Up @@ -987,11 +978,8 @@ def load_db(self) -> Dict:
cursor = conn.cursor()
cursor.execute("SELECT data FROM dictionaries")
rows = cursor.fetchall()
dictionaries = []
for row in rows:
dictionary = json.loads(zlib.decompress(row[0]))
dictionaries.append(dictionary)
return dictionaries
# TODO: Why do you compress? This just makes it more difficult to debug
Copy link
Owner

@thiswillbeyourgithub thiswillbeyourgithub Jan 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially I just dumped JSON, but the size got out of hand surprisingly quickly, so I compressed it with zlib and then figured I might as well use sqlite.

This was totally amateurish; if I had to do it again, I would use sqlite only and enable the built-in compression, of course. But, still being technically an amateur, I'm open to any suggestions.

return [json.loads(zlib.decompress(row[0])) for row in rows]


if __name__ == "__main__":
Expand Down
9 changes: 5 additions & 4 deletions utils/cloze_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ def getclozes(text: str) -> List[str]:


def cloze_input_parser(cloze: str) -> str:
"""edits the cloze from anki before sending it to the LLM. This is useful
if you use weird formatting that mess with LLMs"""
assert iscloze(cloze), f"Invalid cloze: {cloze}"
"""edit the cloze from anki before sending it to the LLM. This is useful
if you use weird formatting that mess with LLMs.
If the note content is not a cloze, then return it unmodified."""
if not iscloze(cloze):
return cloze

# TODO: What is this?
cloze = cloze.replace("\xa0", " ")

# make newlines consistent
Expand Down