From b1faa0907d32417b5e93af97fd17f07e1a9c9a97 Mon Sep 17 00:00:00 2001 From: Kristian Antrobus Date: Fri, 5 Jul 2024 09:58:06 -0500 Subject: [PATCH 1/7] chore(backend): add command to apply migrations --- apps/backend/package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/backend/package.json b/apps/backend/package.json index d9d06a8c43..c094660239 100644 --- a/apps/backend/package.json +++ b/apps/backend/package.json @@ -5,7 +5,8 @@ "scripts": { "start": "yarn supabase start", "stop": "yarn supabase stop", - "generate:db-types": "yarn supabase gen types typescript --local > supabase/schema.gen.ts" + "generate:db-types": "yarn supabase gen types typescript --local > supabase/schema.gen.ts", + "db:reset": "yarn supabase db reset" }, "devDependencies": { "supabase": "^1.136.3" From 107c7c332ee4bbb8d7e8cb10dd59fc154931f8ee Mon Sep 17 00:00:00 2001 From: Kristian Antrobus Date: Fri, 5 Jul 2024 11:25:55 -0500 Subject: [PATCH 2/7] chore(eng-docs): wip for embeddings --- .../engineering/doc-site/docsearch.md | 2 +- .../doc-site/generating-embeddings.md | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 internal-docs/engineering/doc-site/generating-embeddings.md diff --git a/internal-docs/engineering/doc-site/docsearch.md b/internal-docs/engineering/doc-site/docsearch.md index 689bab66e6..e9bb890ef7 100644 --- a/internal-docs/engineering/doc-site/docsearch.md +++ b/internal-docs/engineering/doc-site/docsearch.md @@ -16,7 +16,7 @@ The same happens with Github discussions. The discussion is a page and is split We can then perform a similarity search using the user input as the query. -Embeddings are generated using the the [embedding script](../../../packages/paste-website/scripts/search/). +Embeddings are generated using the the [embedding script](../../../packages/paste-website/scripts/search/). 
For a more detailed explanation of generating embeddings and running locally refernce [generating-embeddings](./generating-embeddings.md). ## Production vs Previews (staging) diff --git a/internal-docs/engineering/doc-site/generating-embeddings.md b/internal-docs/engineering/doc-site/generating-embeddings.md new file mode 100644 index 0000000000..36d0a49421 --- /dev/null +++ b/internal-docs/engineering/doc-site/generating-embeddings.md @@ -0,0 +1,34 @@ +# Generating Embeddings + +Embeddings are what is used for our [Doc Search](./docsearch.md) functionality. An OpenAI embeddings are a technique that uses machine learning and big data to convert unstructured data into structured vector spaces. + +In our use case it converts plain text such as "How do I use button variants?". It uses the model `text-embedding-ada-002` and outputs a similar structure to: `[-0.005330325,0.018767769,0.00020701668,-0.0011101937, ...]` + +## Local Development + +In order to develop locally you will need to startup a local instance of Supabase. The code for this is found in /apps/backend. Follow [this](../../../apps/backend/README.md) document to get setup. + +After you have it setup you should be able to access Supabase at: http://127.0.0.1:54323. If you have no tables you have not applied migrations. you can run ```yarn workspace @twilio-paste/backend db:reset``` from the root of the project. + +**Note:** /if you see an error for vector packages go into [20230928013336_initial_schema](../../../apps/backend/supabase/migrations/20230928013336_initial_schema.sql) and change the following without committing: + +```sql +create extension if not exists "vector" with schema "public" version '0.5.0'; +/* to */ +create extension if not exists "vector" with schema "public"; +``` + +### Environment Variables + +In order to do any GH action or assistant development on the site you will need to set environment variables in ```packages/paste-website/.env```. 
+ +``` +OPENAI_API_KEY="" // USE YOUR PERSONAL TOKEN FOR LOCAL DEV +SUPABASE_URL="http://127.0.0.1:54321" // PRINTED TO CONSOLE AFTER STARTING CONTAINER +SUPABASE_KEY="" // PRINTED TO CONSOLE AFTER STARTING CONTAINER +GH_SERVICE_ACC_DISCUSSIONS_TOKEN="" // IN 1Password UNDER github.com ENTRY +``` + +### Generating Data + +The best way to geenrate data is to run the nightly embed script `generate:embeddings`. This will update the tables: `page` and `page_section`. \ No newline at end of file From face76930a95035da59d4c41708ab98d072bc888 Mon Sep 17 00:00:00 2001 From: Kristian Antrobus Date: Fri, 5 Jul 2024 11:30:50 -0500 Subject: [PATCH 3/7] chore(end): added in new CHANGELOG.md to create:package --- plopfile.js | 5 +++++ tools/plop-templates/CHANGELOG.hbs | 0 2 files changed, 5 insertions(+) create mode 100644 tools/plop-templates/CHANGELOG.hbs diff --git a/plopfile.js b/plopfile.js index dce12aab7d..40b965666c 100644 --- a/plopfile.js +++ b/plopfile.js @@ -81,6 +81,11 @@ module.exports = function (plop) { path: "packages/paste-core/{{component-type}}/{{kebabCase component-name}}/tsconfig.json", templateFile: "tools/plop-templates/tsconfig.hbs", }, + { + type: "add", + path: "packages/paste-core/{{component-type}}/{{kebabCase component-name}}/CHANGELOG.md", + templateFile: "tools/plop-templates/CHANGELOG.hbs", + }, ], }); }; diff --git a/tools/plop-templates/CHANGELOG.hbs b/tools/plop-templates/CHANGELOG.hbs new file mode 100644 index 0000000000..e69de29bb2 From c9a118989f4b838132524114421b872b2a9467a9 Mon Sep 17 00:00:00 2001 From: Kristian Antrobus Date: Fri, 5 Jul 2024 13:23:56 -0500 Subject: [PATCH 4/7] chore(eng-docs): update embeddings doc --- .../doc-site/generating-embeddings.md | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/internal-docs/engineering/doc-site/generating-embeddings.md b/internal-docs/engineering/doc-site/generating-embeddings.md index 36d0a49421..86adadb0de 100644 --- 
a/internal-docs/engineering/doc-site/generating-embeddings.md +++ b/internal-docs/engineering/doc-site/generating-embeddings.md @@ -1,16 +1,16 @@ # Generating Embeddings -Embeddings are what is used for our [Doc Search](./docsearch.md) functionality. An OpenAI embeddings are a technique that uses machine learning and big data to convert unstructured data into structured vector spaces. +Embeddings are what is used for our [Doc Search](./docsearch.md) functionality. OpenAI embeddings are a technique that uses machine learning and big data to convert unstructured data into structured vector spaces. -In our use case it converts plain text such as "How do I use button variants?". It uses the model `text-embedding-ada-002` and outputs a similar structure to: `[-0.005330325,0.018767769,0.00020701668,-0.0011101937, ...]` +In our use case it converts plain text such as search criteria, mdx headers and GitHub discussion titles. It uses the model `text-embedding-ada-002` and outputs a similar structure to: `[-0.005330325,0.018767769,0.00020701668,-0.0011101937, ...]` ## Local Development -In order to develop locally you will need to startup a local instance of Supabase. The code for this is found in /apps/backend. Follow [this](../../../apps/backend/README.md) document to get setup. +In order to develop locally you will need to startup a local instance of Supabase. The code for this is found in `/apps/backend`. Follow [this](../../../apps/backend/README.md) document to get setup. -After you have it setup you should be able to access Supabase at: http://127.0.0.1:54323. If you have no tables you have not applied migrations. you can run ```yarn workspace @twilio-paste/backend db:reset``` from the root of the project. +After you have it setup you should be able to access Supabase at: http://127.0.0.1:54323. If you have no tables, you have not applied migrations. You can run ```yarn workspace @twilio-paste/backend db:reset``` from the root of the project. 
-**Note:** /if you see an error for vector packages go into [20230928013336_initial_schema](../../../apps/backend/supabase/migrations/20230928013336_initial_schema.sql) and change the following without committing: +**Note**: if you see an error for vector packages go into [20230928013336_initial_schema](../../../apps/backend/supabase/migrations/20230928013336_initial_schema.sql) and change the following without committing: ```sql create extension if not exists "vector" with schema "public" version '0.5.0'; @@ -31,4 +31,12 @@ GH_SERVICE_ACC_DISCUSSIONS_TOKEN="" // IN 1Password UNDER github.com ENTRY ### Generating Data -The best way to geenrate data is to run the nightly embed script `generate:embeddings`. This will update the tables: `page` and `page_section`. \ No newline at end of file +The best way to generate data is to run the nightly embed script `generate:embeddings`. This will update the tables: `page` and `page_section`. + +## Table Structure + +While there are other tables the only ones that concern the embeddings creation are: +- **page**: Stores the metadata of the entry. Key columns are the checksum (used to determine whether to update the record), path (either the url of the page or the github discussion), type (github-discussion or markdown) +- **page_sections**: contains the search embeddings. Key columns are content (plain text headings/titles), embedding (the vector spaces created from OpenAI), slug (yo string of content or the discussion/answer in GitHub). + +Both tables are related with page being thge parent. They are joined by `page.id on page_section.page_id`. 
\ No newline at end of file From 2fd59ae57303619a7dfae8c4498225b9155da402 Mon Sep 17 00:00:00 2001 From: Kristian Antrobus Date: Fri, 5 Jul 2024 13:24:45 -0500 Subject: [PATCH 5/7] chore(eng-docs): md highlights --- internal-docs/engineering/doc-site/generating-embeddings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal-docs/engineering/doc-site/generating-embeddings.md b/internal-docs/engineering/doc-site/generating-embeddings.md index 86adadb0de..a6c7802ce6 100644 --- a/internal-docs/engineering/doc-site/generating-embeddings.md +++ b/internal-docs/engineering/doc-site/generating-embeddings.md @@ -10,7 +10,7 @@ In order to develop locally you will need to startup a local instance of Supabas After you have it setup you should be able to access Supabase at: http://127.0.0.1:54323. If you have no tables, you have not applied migrations. You can run ```yarn workspace @twilio-paste/backend db:reset``` from the root of the project. -**Note**: if you see an error for vector packages go into [20230928013336_initial_schema](../../../apps/backend/supabase/migrations/20230928013336_initial_schema.sql) and change the following without committing: +**Note**: if you see an error for vector packages go into [20230928013336_initial_schema](../../../apps/backend/supabase/migrations/20230928013336_initial_schema.sql) and change the following **without committing**: ```sql create extension if not exists "vector" with schema "public" version '0.5.0'; From 495a519476154bec447861739f880043117f6d36 Mon Sep 17 00:00:00 2001 From: Kristian Antrobus Date: Fri, 5 Jul 2024 13:45:15 -0500 Subject: [PATCH 6/7] chore(eng-docs): document NPM publish process --- internal-docs/engineering/publishing-npm-package.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 internal-docs/engineering/publishing-npm-package.md diff --git a/internal-docs/engineering/publishing-npm-package.md b/internal-docs/engineering/publishing-npm-package.md new file mode 
100644 index 0000000000..93dafe9f34 --- /dev/null +++ b/internal-docs/engineering/publishing-npm-package.md @@ -0,0 +1,13 @@ +# Publishing NPM Package + +Paste core uses [changesets](https://github.com/changesets/changesets) to manage versions and changelogs. It has great support for monorepos and multi-package repositories, which makes it ideal for `@twilio-paste/core`. + +Changesets has a great [GitHub action](https://github.com/changesets/action) that will manage the release by creating a PR, periodically pulling changes from main. No code is published to NPM until this PR is merged, which is controlled by the team. + +The PR will always be called `Version Packages` and lists all the changes that have been made since the last release. The description is also updated with the changeset entries from the merged PRs, making it easy to see what will be released. + +There is a step in the GitHub Action [on_merge_to_main](../../.github/workflows/on_merge_to_main.yml) with the name `Create Pull Request or Publish to npm`. This step defines which command from [package.json](../../package.json) is run for each operation: + +- version: removes all of the temporary changeset files which are generated during development and aggregates them into a changelog entry. +- publish: responsible for publishing the package to NPM. +- commit: `chore(release): version packages` is the commit message used on squash and merge. 
\ No newline at end of file From a725b3bd090f990946bef1f9cd495643009386bc Mon Sep 17 00:00:00 2001 From: Kristian Antrobus Date: Fri, 5 Jul 2024 15:42:37 -0500 Subject: [PATCH 7/7] chore(eng-docs): typo fixes --- internal-docs/engineering/doc-site/docsearch.md | 2 +- internal-docs/engineering/doc-site/generating-embeddings.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal-docs/engineering/doc-site/docsearch.md b/internal-docs/engineering/doc-site/docsearch.md index e9bb890ef7..a9967dde39 100644 --- a/internal-docs/engineering/doc-site/docsearch.md +++ b/internal-docs/engineering/doc-site/docsearch.md @@ -16,7 +16,7 @@ The same happens with Github discussions. The discussion is a page and is split We can then perform a similarity search using the user input as the query. -Embeddings are generated using the the [embedding script](../../../packages/paste-website/scripts/search/). For a more detailed explanation of generating embeddings and running locally refernce [generating-embeddings](./generating-embeddings.md). +Embeddings are generated using the [embedding script](../../../packages/paste-website/scripts/search/). For a more detailed explanation of generating embeddings and running locally, reference [generating-embeddings](./generating-embeddings.md). ## Production vs Previews (staging) diff --git a/internal-docs/engineering/doc-site/generating-embeddings.md b/internal-docs/engineering/doc-site/generating-embeddings.md index a6c7802ce6..4393487a7c 100644 --- a/internal-docs/engineering/doc-site/generating-embeddings.md +++ b/internal-docs/engineering/doc-site/generating-embeddings.md @@ -37,6 +37,6 @@ The best way to generate data is to run the nightly embed script `generate:embed While there are other tables the only ones that concern the embeddings creation are: - **page**: Stores the metadata of the entry. 
Key columns are the checksum (used to determine whether to update the record), path (either the url of the page or the github discussion), type (github-discussion or markdown) -- **page_sections**: contains the search embeddings. Key columns are content (plain text headings/titles), embedding (the vector spaces created from OpenAI), slug (yo string of content or the discussion/answer in GitHub). +- **page_section**: contains the search embeddings. Key columns are content (plain text headings/titles), embedding (the vector spaces created from OpenAI), slug (toString of content or the discussion/answer in GitHub). -Both tables are related with page being thge parent. They are joined by `page.id on page_section.page_id`. \ No newline at end of file +Both tables are related with page being the parent. They are joined by `page.id on page_section.page_id`. \ No newline at end of file