Skip to content

Commit b1178ac

Browse files
committed
add mdx
1 parent 86c424b commit b1178ac

File tree

145 files changed

+3588
-2
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

145 files changed

+3588
-2
lines changed

.github/actions/make_mdx.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ mv $f $f.mdx
4242
done
4343

4444
for f in *.sh
45-
do sed -i '1i```shell\' $f
45+
do sed -i '1i```bash\' $f
4646
sed -i '$ a ```' $f
4747
mv $f $f.mdx
4848
done
@@ -57,7 +57,7 @@ mv $f $f.mdx
5757
done
5858

5959
for f in *.sh
60-
do sed -i '1i```shell\' $f
60+
do sed -i '1i```bash\' $f
6161
sed -i '$ a ```' $f
6262
mv $f $f.mdx
6363
done
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
```python
2+
import os
3+
4+
from unstructured.ingest.connector.astra import (
5+
AstraAccessConfig,
6+
AstraWriteConfig,
7+
SimpleAstraConfig,
8+
)
9+
from unstructured.ingest.connector.local import SimpleLocalConfig
10+
from unstructured.ingest.interfaces import (
11+
ChunkingConfig,
12+
EmbeddingConfig,
13+
PartitionConfig,
14+
ProcessorConfig,
15+
ReadConfig,
16+
)
17+
from unstructured.ingest.runner import LocalRunner
18+
from unstructured.ingest.runner.writers.astra import (
19+
AstraWriter,
20+
)
21+
from unstructured.ingest.runner.writers.base_writer import Writer
22+
23+
24+
def get_writer() -> Writer:
25+
return AstraWriter(
26+
connector_config=SimpleAstraConfig(
27+
access_config=AstraAccessConfig(
28+
token=os.getenv("ASTRA_DB_TOKEN"), api_endpoint=os.getenv("ASTRA_DB_ENDPOINT")
29+
),
30+
collection_name="test_collection",
31+
embedding_dimension=384,
32+
),
33+
write_config=AstraWriteConfig(batch_size=80),
34+
)
35+
36+
37+
if __name__ == "__main__":
38+
writer = get_writer()
39+
runner = LocalRunner(
40+
processor_config=ProcessorConfig(
41+
verbose=True,
42+
output_dir="local-output-to-astra",
43+
num_processes=2,
44+
),
45+
connector_config=SimpleLocalConfig(
46+
input_path="example-docs/book-war-and-peace-1p.txt",
47+
),
48+
read_config=ReadConfig(),
49+
partition_config=PartitionConfig(),
50+
chunking_config=ChunkingConfig(chunk_elements=True),
51+
embedding_config=EmbeddingConfig(
52+
provider="langchain-huggingface",
53+
),
54+
writer=writer,
55+
writer_kwargs={},
56+
)
57+
runner.run()
58+
```
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
```bash
2+
#!/usr/bin/env bash
3+
4+
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}
5+
6+
unstructured-ingest \
7+
local \
8+
--input-path example-docs/book-war-and-peace-1p.txt \
9+
--output-dir local-output-to-astra \
10+
--strategy fast \
11+
--chunk-elements \
12+
--embedding-provider "$EMBEDDING_PROVIDER" \
13+
--num-processes 2 \
14+
--verbose \
15+
astra \
16+
--token "$ASTRA_DB_TOKEN" \
17+
--api-endpoint "$ASTRA_DB_ENDPOINT" \
18+
--collection-name "$COLLECTION_NAME" \
19+
--embedding-dimension "$EMBEDDING_DIMENSION"
20+
```
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
```python
2+
from unstructured.ingest.connector.fsspec.azure import (
3+
AzureAccessConfig,
4+
AzureWriteConfig,
5+
SimpleAzureBlobStorageConfig,
6+
)
7+
from unstructured.ingest.connector.local import SimpleLocalConfig
8+
from unstructured.ingest.interfaces import (
9+
ChunkingConfig,
10+
EmbeddingConfig,
11+
PartitionConfig,
12+
ProcessorConfig,
13+
ReadConfig,
14+
)
15+
from unstructured.ingest.runner import LocalRunner
16+
from unstructured.ingest.runner.writers.base_writer import Writer
17+
from unstructured.ingest.runner.writers.fsspec.azure import (
18+
AzureWriter,
19+
)
20+
21+
22+
def get_writer() -> Writer:
23+
return AzureWriter(
24+
connector_config=SimpleAzureBlobStorageConfig(
25+
access_config=AzureAccessConfig(account_name="azureunstructured1"),
26+
remote_url="az://unstructured/war-and-peace-output",
27+
),
28+
write_config=AzureWriteConfig(),
29+
)
30+
31+
32+
if __name__ == "__main__":
33+
writer = get_writer()
34+
runner = LocalRunner(
35+
processor_config=ProcessorConfig(
36+
verbose=True,
37+
output_dir="local-output-to-azure",
38+
num_processes=2,
39+
),
40+
connector_config=SimpleLocalConfig(
41+
input_path="example-docs/book-war-and-peace-1225p.txt",
42+
),
43+
read_config=ReadConfig(),
44+
partition_config=PartitionConfig(),
45+
chunking_config=ChunkingConfig(chunk_elements=True),
46+
embedding_config=EmbeddingConfig(
47+
provider="langchain-huggingface",
48+
),
49+
writer=writer,
50+
writer_kwargs={},
51+
)
52+
runner.run()
53+
```
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
```bash
2+
#!/usr/bin/env bash
3+
4+
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}
5+
6+
unstructured-ingest \
7+
local \
8+
--input-path example-docs/book-war-and-peace-1225p.txt \
9+
--output-dir local-output-to-azure \
10+
--strategy fast \
11+
--chunk-elements \
12+
--embedding-provider "$EMBEDDING_PROVIDER" \
13+
--num-processes 2 \
14+
--verbose \
15+
azure \
16+
--account-name azureunstructured1 \
17+
--remote-url "<your destination path here, ie 'az://unstructured/war-and-peace-output'>"
18+
```
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
```python
2+
import os
3+
4+
from unstructured.ingest.connector.azure_cognitive_search import (
5+
AzureCognitiveSearchAccessConfig,
6+
AzureCognitiveSearchWriteConfig,
7+
SimpleAzureCognitiveSearchStorageConfig,
8+
)
9+
from unstructured.ingest.connector.local import SimpleLocalConfig
10+
from unstructured.ingest.interfaces import (
11+
ChunkingConfig,
12+
EmbeddingConfig,
13+
PartitionConfig,
14+
ProcessorConfig,
15+
ReadConfig,
16+
)
17+
from unstructured.ingest.runner import LocalRunner
18+
from unstructured.ingest.runner.writers.azure_cognitive_search import (
19+
AzureCognitiveSearchWriter,
20+
)
21+
from unstructured.ingest.runner.writers.base_writer import Writer
22+
23+
24+
def get_writer() -> Writer:
25+
return AzureCognitiveSearchWriter(
26+
connector_config=SimpleAzureCognitiveSearchStorageConfig(
27+
access_config=AzureCognitiveSearchAccessConfig(key=os.getenv("AZURE_SEARCH_API_KEY")),
28+
endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
29+
),
30+
write_config=AzureCognitiveSearchWriteConfig(index="utic-test-ingest-fixtures-output"),
31+
)
32+
33+
34+
if __name__ == "__main__":
35+
writer = get_writer()
36+
runner = LocalRunner(
37+
processor_config=ProcessorConfig(
38+
verbose=True,
39+
output_dir="local-output-to-azure-cog-search",
40+
num_processes=2,
41+
),
42+
connector_config=SimpleLocalConfig(
43+
input_path="example-docs/book-war-and-peace-1225p.txt",
44+
),
45+
read_config=ReadConfig(),
46+
partition_config=PartitionConfig(),
47+
chunking_config=ChunkingConfig(chunk_elements=True),
48+
embedding_config=EmbeddingConfig(
49+
provider="langchain-huggingface",
50+
),
51+
writer=writer,
52+
writer_kwargs={},
53+
)
54+
runner.run()
55+
```
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
```bash
2+
#!/usr/bin/env bash
3+
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}
4+
5+
unstructured-ingest \
6+
local \
7+
--input-path example-docs/book-war-and-peace-1225p.txt \
8+
--output-dir local-output-to-azure-cog-search \
9+
--strategy fast \
10+
--chunk-elements \
11+
--embedding-provider "$EMBEDDING_PROVIDER" \
12+
--num-processes 2 \
13+
--verbose \
14+
azure-cognitive-search \
15+
--key "$AZURE_SEARCH_API_KEY" \
16+
--endpoint "$AZURE_SEARCH_ENDPOINT" \
17+
--index utic-test-ingest-fixtures-output
18+
```
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
```python
2+
import os
3+
4+
from unstructured.ingest.connector.fsspec.box import (
5+
BoxAccessConfig,
6+
BoxWriteConfig,
7+
SimpleBoxConfig,
8+
)
9+
from unstructured.ingest.connector.local import SimpleLocalConfig
10+
from unstructured.ingest.interfaces import (
11+
ChunkingConfig,
12+
EmbeddingConfig,
13+
PartitionConfig,
14+
ProcessorConfig,
15+
ReadConfig,
16+
)
17+
from unstructured.ingest.runner import LocalRunner
18+
from unstructured.ingest.runner.writers.base_writer import Writer
19+
from unstructured.ingest.runner.writers.fsspec.box import (
20+
BoxWriter,
21+
)
22+
23+
24+
def get_writer() -> Writer:
25+
return BoxWriter(
26+
connector_config=SimpleBoxConfig(
27+
access_config=BoxAccessConfig(box_app_config=os.getenv("BOX_APP_CONFIG_PATH")),
28+
remote_url="box://unstructured/war-and-peace-output",
29+
),
30+
write_config=BoxWriteConfig(),
31+
)
32+
33+
34+
if __name__ == "__main__":
35+
writer = get_writer()
36+
runner = LocalRunner(
37+
processor_config=ProcessorConfig(
38+
verbose=True,
39+
output_dir="local-output-to-box",
40+
num_processes=2,
41+
),
42+
connector_config=SimpleLocalConfig(
43+
input_path="example-docs/book-war-and-peace-1225p.txt",
44+
),
45+
read_config=ReadConfig(),
46+
partition_config=PartitionConfig(),
47+
chunking_config=ChunkingConfig(chunk_elements=True),
48+
embedding_config=EmbeddingConfig(
49+
provider="langchain-huggingface",
50+
),
51+
writer=writer,
52+
writer_kwargs={},
53+
)
54+
runner.run()
55+
```
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
```bash
2+
#!/usr/bin/env bash
3+
4+
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}
5+
6+
unstructured-ingest \
7+
local \
8+
--input-path example-docs/book-war-and-peace-1225p.txt \
9+
--output-dir local-output-to-box \
10+
--strategy fast \
11+
--chunk-elements \
12+
--embedding-provider "$EMBEDDING_PROVIDER" \
13+
--num-processes 2 \
14+
--verbose \
15+
box \
16+
--box-app-config "$BOX_APP_CONFIG_PATH" \
17+
--remote-url "<your destination path here, ie 'box://unstructured/war-and-peace-output'>"
18+
```
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
```python
2+
from unstructured.ingest.connector.chroma import (
3+
ChromaAccessConfig,
4+
ChromaWriteConfig,
5+
SimpleChromaConfig,
6+
)
7+
from unstructured.ingest.connector.local import SimpleLocalConfig
8+
from unstructured.ingest.interfaces import (
9+
ChunkingConfig,
10+
EmbeddingConfig,
11+
PartitionConfig,
12+
ProcessorConfig,
13+
ReadConfig,
14+
)
15+
from unstructured.ingest.runner import LocalRunner
16+
from unstructured.ingest.runner.writers.base_writer import Writer
17+
from unstructured.ingest.runner.writers.chroma import (
18+
ChromaWriter,
19+
)
20+
21+
22+
def get_writer() -> Writer:
23+
return ChromaWriter(
24+
connector_config=SimpleChromaConfig(
25+
access_config=ChromaAccessConfig(),
26+
host="localhost",
27+
port=8000,
28+
collection_name="elements",
29+
tenant="default_tenant",
30+
database="default_database",
31+
),
32+
write_config=ChromaWriteConfig(),
33+
)
34+
35+
36+
if __name__ == "__main__":
37+
writer = get_writer()
38+
runner = LocalRunner(
39+
processor_config=ProcessorConfig(
40+
verbose=True,
41+
output_dir="local-output-to-chroma",
42+
num_processes=2,
43+
),
44+
connector_config=SimpleLocalConfig(
45+
input_path="example-docs/book-war-and-peace-1225p.txt",
46+
),
47+
read_config=ReadConfig(),
48+
partition_config=PartitionConfig(),
49+
chunking_config=ChunkingConfig(chunk_elements=True),
50+
embedding_config=EmbeddingConfig(
51+
provider="langchain-huggingface",
52+
),
53+
writer=writer,
54+
writer_kwargs={},
55+
)
56+
runner.run()
57+
```

0 commit comments

Comments
 (0)