Skip to content

Commit d0db16a

Browse files
Merge pull request #4 from Unstructured-IO/potter/create_mdx_script
add script to convert py and sh to mdx
2 parents ed8a2eb + ae24870 commit d0db16a

File tree

145 files changed

+3645
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

145 files changed

+3645
-0
lines changed

scripts/make_mdx.sh

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env bash
2+
3+
# This script downloads the .py and .sh code from the open source repo
4+
# Then it converts the code to markdown files
5+
6+
# Requires two ABSOLUTE paths to the final destination and source directories
7+
# ex: ./make_mdx.sh /Users/potter/dest /Users/potter/src
8+
9+
# Both target directories are required and must be absolute. The script
# changes into a temp checkout before copying, so a relative path would
# resolve inside that checkout, and an empty one would make the copies
# target "/".
if [ "$#" -lt 2 ]; then
  echo "usage: $0 <absolute destination dir> <absolute source dir>" >&2
  exit 1
fi
for target_dir in "$1" "$2"; do
  case "$target_dir" in
    /*) ;;
    *)
      echo "error: '$target_dir' is not an absolute path" >&2
      exit 1
      ;;
  esac
done

# make temp directory
WORK_DIR=$(mktemp -d) || exit 1

PY_DEST_REPO="docs/source/ingest/destination_connectors/code/python/"
SH_DEST_REPO="docs/source/ingest/destination_connectors/code/bash/"
DEST_TARGET_DIR=$1 # first argument to script
PY_SRC_REPO="docs/source/ingest/source_connectors/code/python/"
SH_SRC_REPO="docs/source/ingest/source_connectors/code/bash/"
SRC_TARGET_DIR=$2 # second argument to script

# Clone the correct directories in the open source repo
# Stop if the temp directory cannot be entered; otherwise the git commands
# below would run in (and modify) the caller's current directory.
cd "$WORK_DIR" || exit 1
git init
git remote add -f origin https://github.com/Unstructured-IO/unstructured
git config core.sparseCheckout true
{
  echo "$PY_DEST_REPO"
  echo "$SH_DEST_REPO"
  echo "$PY_SRC_REPO"
  echo "$SH_SRC_REPO"
} >> .git/info/sparse-checkout
if ! git pull origin main; then
  echo "error: could not fetch the example code from the open source repo" >&2
  rm -rf "$WORK_DIR"
  exit 1
fi

# cp -R needs the target directories to exist already.
mkdir -p "$DEST_TARGET_DIR" "$SRC_TARGET_DIR"
cp -R "$WORK_DIR/$PY_DEST_REPO/." "$DEST_TARGET_DIR/"
cp -R "$WORK_DIR/$SH_DEST_REPO/." "$DEST_TARGET_DIR/"
cp -R "$WORK_DIR/$PY_SRC_REPO/." "$SRC_TARGET_DIR/"
cp -R "$WORK_DIR/$SH_SRC_REPO/." "$SRC_TARGET_DIR/"
34+
35+
# Wrap one source file in a fenced markdown code block.
#   $1 - fence language tag (python or bash)
#   $2 - file to convert; it is replaced by "$2.mdx"
# Uses printf/cat instead of `sed -i`, whose in-place syntax differs between
# GNU and BSD (macOS) sed.
function _wrap_in_fence() {
  local lang=$1 src=$2
  {
    printf '```%s\n' "$lang"
    cat -- "$src"
    # Keep the closing fence on its own line even when the source file does
    # not end with a newline ($(...) is empty when the last byte is "\n").
    if [ -n "$(tail -c 1 -- "$src")" ]; then
      printf '\n'
    fi
    printf '```\n'
  } > "$src.mdx" && rm -- "$src"
}

# Convert every *.py and *.sh file in the current directory into
# <name>.py.mdx / <name>.sh.mdx, wrapped in a ```python / ```bash fence.
# The original files are removed once the .mdx file has been written.
function to_mdx() {
  local f
  for f in *.py; do
    # An unmatched glob stays literal ("*.py"); skip it instead of failing.
    [ -f "$f" ] || continue
    _wrap_in_fence python "$f"
  done

  for f in *.sh; do
    [ -f "$f" ] || continue
    _wrap_in_fence bash "$f"
  done
}
48+
49+
# Convert the destination_connectors to markdown
# to_mdx rewrites the files of the current directory, so stop (after removing
# the temp checkout) if the target directory cannot be entered.
cd "$DEST_TARGET_DIR" || { rm -rf "$WORK_DIR"; exit 1; }
to_mdx

# Convert the source_connectors to markdown
cd "$SRC_TARGET_DIR" || { rm -rf "$WORK_DIR"; exit 1; }
to_mdx

rm -rf "$WORK_DIR"

echo "Markdown files created in $DEST_TARGET_DIR and $SRC_TARGET_DIR"
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
```python
2+
import os
3+
4+
from unstructured.ingest.connector.astra import (
5+
AstraAccessConfig,
6+
AstraWriteConfig,
7+
SimpleAstraConfig,
8+
)
9+
from unstructured.ingest.connector.local import SimpleLocalConfig
10+
from unstructured.ingest.interfaces import (
11+
ChunkingConfig,
12+
EmbeddingConfig,
13+
PartitionConfig,
14+
ProcessorConfig,
15+
ReadConfig,
16+
)
17+
from unstructured.ingest.runner import LocalRunner
18+
from unstructured.ingest.runner.writers.astra import (
19+
AstraWriter,
20+
)
21+
from unstructured.ingest.runner.writers.base_writer import Writer
22+
23+
24+
def get_writer() -> Writer:
    """Build the Astra writer used by this example.

    The token and API endpoint are read from the ``ASTRA_DB_TOKEN`` and
    ``ASTRA_DB_ENDPOINT`` environment variables.
    """
    credentials = AstraAccessConfig(
        token=os.getenv("ASTRA_DB_TOKEN"),
        api_endpoint=os.getenv("ASTRA_DB_ENDPOINT"),
    )
    destination = SimpleAstraConfig(
        access_config=credentials,
        collection_name="test_collection",
        embedding_dimension=384,
    )
    batching = AstraWriteConfig(batch_size=80)
    return AstraWriter(connector_config=destination, write_config=batching)
35+
36+
37+
if __name__ == "__main__":
    # Build the writer first, then each runner config in the order the
    # runner receives them.
    destination_writer = get_writer()
    processing = ProcessorConfig(
        verbose=True,
        output_dir="local-output-to-astra",
        num_processes=2,
    )
    local_source = SimpleLocalConfig(
        input_path="example-docs/book-war-and-peace-1p.txt",
    )
    reading = ReadConfig()
    partitioning = PartitionConfig()
    chunking = ChunkingConfig(chunk_elements=True)
    embedding = EmbeddingConfig(provider="langchain-huggingface")

    LocalRunner(
        processor_config=processing,
        connector_config=local_source,
        read_config=reading,
        partition_config=partitioning,
        chunking_config=chunking,
        embedding_config=embedding,
        writer=destination_writer,
        writer_kwargs={},
    ).run()
58+
```
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
```bash
2+
#!/usr/bin/env bash
3+
4+
# Keep an EMBEDDING_PROVIDER already set in the environment; otherwise fall
# back to "langchain-huggingface".
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}
5+
6+
# Single unstructured-ingest invocation: the options listed before `astra`
# follow the `local` subcommand, the ones after it follow `astra`.
# ASTRA_DB_TOKEN, ASTRA_DB_ENDPOINT, COLLECTION_NAME and EMBEDDING_DIMENSION
# are expanded from the environment; none of them is assigned in this script.
unstructured-ingest \
7+
local \
8+
--input-path example-docs/book-war-and-peace-1p.txt \
9+
--output-dir local-output-to-astra \
10+
--strategy fast \
11+
--chunk-elements \
12+
--embedding-provider "$EMBEDDING_PROVIDER" \
13+
--num-processes 2 \
14+
--verbose \
15+
astra \
16+
--token "$ASTRA_DB_TOKEN" \
17+
--api-endpoint "$ASTRA_DB_ENDPOINT" \
18+
--collection-name "$COLLECTION_NAME" \
19+
--embedding-dimension "$EMBEDDING_DIMENSION"
20+
```
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
```python
2+
from unstructured.ingest.connector.fsspec.azure import (
3+
AzureAccessConfig,
4+
AzureWriteConfig,
5+
SimpleAzureBlobStorageConfig,
6+
)
7+
from unstructured.ingest.connector.local import SimpleLocalConfig
8+
from unstructured.ingest.interfaces import (
9+
ChunkingConfig,
10+
EmbeddingConfig,
11+
PartitionConfig,
12+
ProcessorConfig,
13+
ReadConfig,
14+
)
15+
from unstructured.ingest.runner import LocalRunner
16+
from unstructured.ingest.runner.writers.base_writer import Writer
17+
from unstructured.ingest.runner.writers.fsspec.azure import (
18+
AzureWriter,
19+
)
20+
21+
22+
def get_writer() -> Writer:
    """Build the Azure writer used by this example.

    The account name and remote URL are hard-coded example values.
    """
    credentials = AzureAccessConfig(account_name="azureunstructured1")
    destination = SimpleAzureBlobStorageConfig(
        access_config=credentials,
        remote_url="az://unstructured/war-and-peace-output",
    )
    return AzureWriter(
        connector_config=destination,
        write_config=AzureWriteConfig(),
    )
30+
31+
32+
if __name__ == "__main__":
    # Build the writer first, then each runner config in the order the
    # runner receives them.
    destination_writer = get_writer()
    processing = ProcessorConfig(
        verbose=True,
        output_dir="local-output-to-azure",
        num_processes=2,
    )
    local_source = SimpleLocalConfig(
        input_path="example-docs/book-war-and-peace-1225p.txt",
    )
    reading = ReadConfig()
    partitioning = PartitionConfig()
    chunking = ChunkingConfig(chunk_elements=True)
    embedding = EmbeddingConfig(provider="langchain-huggingface")

    LocalRunner(
        processor_config=processing,
        connector_config=local_source,
        read_config=reading,
        partition_config=partitioning,
        chunking_config=chunking,
        embedding_config=embedding,
        writer=destination_writer,
        writer_kwargs={},
    ).run()
53+
```
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
```bash
2+
#!/usr/bin/env bash
3+
4+
# Keep an EMBEDDING_PROVIDER already set in the environment; otherwise fall
# back to "langchain-huggingface".
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}
5+
6+
# Single unstructured-ingest invocation: the options listed before `azure`
# follow the `local` subcommand, the ones after it follow `azure`.
# The --remote-url value is a placeholder and must be replaced with a real
# destination path before running.
unstructured-ingest \
7+
local \
8+
--input-path example-docs/book-war-and-peace-1225p.txt \
9+
--output-dir local-output-to-azure \
10+
--strategy fast \
11+
--chunk-elements \
12+
--embedding-provider "$EMBEDDING_PROVIDER" \
13+
--num-processes 2 \
14+
--verbose \
15+
azure \
16+
--account-name azureunstructured1 \
17+
--remote-url "<your destination path here, ie 'az://unstructured/war-and-peace-output'>"
18+
```
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
```python
2+
import os
3+
4+
from unstructured.ingest.connector.azure_cognitive_search import (
5+
AzureCognitiveSearchAccessConfig,
6+
AzureCognitiveSearchWriteConfig,
7+
SimpleAzureCognitiveSearchStorageConfig,
8+
)
9+
from unstructured.ingest.connector.local import SimpleLocalConfig
10+
from unstructured.ingest.interfaces import (
11+
ChunkingConfig,
12+
EmbeddingConfig,
13+
PartitionConfig,
14+
ProcessorConfig,
15+
ReadConfig,
16+
)
17+
from unstructured.ingest.runner import LocalRunner
18+
from unstructured.ingest.runner.writers.azure_cognitive_search import (
19+
AzureCognitiveSearchWriter,
20+
)
21+
from unstructured.ingest.runner.writers.base_writer import Writer
22+
23+
24+
def get_writer() -> Writer:
    """Build the Azure Cognitive Search writer used by this example.

    The API key and endpoint are read from the ``AZURE_SEARCH_API_KEY`` and
    ``AZURE_SEARCH_ENDPOINT`` environment variables.
    """
    return AzureCognitiveSearchWriter(
        connector_config=SimpleAzureCognitiveSearchStorageConfig(
            access_config=AzureCognitiveSearchAccessConfig(key=os.getenv("AZURE_SEARCH_API_KEY")),
            # os.getenv takes the bare variable name; a leading "$" (shell
            # syntax) would look up a variable that is never set.
            endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
        ),
        write_config=AzureCognitiveSearchWriteConfig(index="utic-test-ingest-fixtures-output"),
    )
32+
33+
34+
if __name__ == "__main__":
    # Build the writer first, then each runner config in the order the
    # runner receives them.
    destination_writer = get_writer()
    processing = ProcessorConfig(
        verbose=True,
        output_dir="local-output-to-azure-cog-search",
        num_processes=2,
    )
    local_source = SimpleLocalConfig(
        input_path="example-docs/book-war-and-peace-1225p.txt",
    )
    reading = ReadConfig()
    partitioning = PartitionConfig()
    chunking = ChunkingConfig(chunk_elements=True)
    embedding = EmbeddingConfig(provider="langchain-huggingface")

    LocalRunner(
        processor_config=processing,
        connector_config=local_source,
        read_config=reading,
        partition_config=partitioning,
        chunking_config=chunking,
        embedding_config=embedding,
        writer=destination_writer,
        writer_kwargs={},
    ).run()
55+
```
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
```bash
2+
#!/usr/bin/env bash
3+
# Keep an EMBEDDING_PROVIDER already set in the environment; otherwise fall
# back to "langchain-huggingface".
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}
4+
5+
# Single unstructured-ingest invocation: the options listed before
# `azure-cognitive-search` follow the `local` subcommand, the ones after it
# follow `azure-cognitive-search`.
# AZURE_SEARCH_API_KEY and AZURE_SEARCH_ENDPOINT are expanded from the
# environment; neither is assigned in this script.
unstructured-ingest \
6+
local \
7+
--input-path example-docs/book-war-and-peace-1225p.txt \
8+
--output-dir local-output-to-azure-cog-search \
9+
--strategy fast \
10+
--chunk-elements \
11+
--embedding-provider "$EMBEDDING_PROVIDER" \
12+
--num-processes 2 \
13+
--verbose \
14+
azure-cognitive-search \
15+
--key "$AZURE_SEARCH_API_KEY" \
16+
--endpoint "$AZURE_SEARCH_ENDPOINT" \
17+
--index utic-test-ingest-fixtures-output
18+
```
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
```python
2+
import os
3+
4+
from unstructured.ingest.connector.fsspec.box import (
5+
BoxAccessConfig,
6+
BoxWriteConfig,
7+
SimpleBoxConfig,
8+
)
9+
from unstructured.ingest.connector.local import SimpleLocalConfig
10+
from unstructured.ingest.interfaces import (
11+
ChunkingConfig,
12+
EmbeddingConfig,
13+
PartitionConfig,
14+
ProcessorConfig,
15+
ReadConfig,
16+
)
17+
from unstructured.ingest.runner import LocalRunner
18+
from unstructured.ingest.runner.writers.base_writer import Writer
19+
from unstructured.ingest.runner.writers.fsspec.box import (
20+
BoxWriter,
21+
)
22+
23+
24+
def get_writer() -> Writer:
    """Build the Box writer used by this example.

    The Box app config value is read from the ``BOX_APP_CONFIG_PATH``
    environment variable; the remote URL is a hard-coded example value.
    """
    credentials = BoxAccessConfig(box_app_config=os.getenv("BOX_APP_CONFIG_PATH"))
    destination = SimpleBoxConfig(
        access_config=credentials,
        remote_url="box://unstructured/war-and-peace-output",
    )
    return BoxWriter(
        connector_config=destination,
        write_config=BoxWriteConfig(),
    )
32+
33+
34+
if __name__ == "__main__":
    # Build the writer first, then each runner config in the order the
    # runner receives them.
    destination_writer = get_writer()
    processing = ProcessorConfig(
        verbose=True,
        output_dir="local-output-to-box",
        num_processes=2,
    )
    local_source = SimpleLocalConfig(
        input_path="example-docs/book-war-and-peace-1225p.txt",
    )
    reading = ReadConfig()
    partitioning = PartitionConfig()
    chunking = ChunkingConfig(chunk_elements=True)
    embedding = EmbeddingConfig(provider="langchain-huggingface")

    LocalRunner(
        processor_config=processing,
        connector_config=local_source,
        read_config=reading,
        partition_config=partitioning,
        chunking_config=chunking,
        embedding_config=embedding,
        writer=destination_writer,
        writer_kwargs={},
    ).run()
55+
```
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
```bash
2+
#!/usr/bin/env bash
3+
4+
# Keep an EMBEDDING_PROVIDER already set in the environment; otherwise fall
# back to "langchain-huggingface".
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}
5+
6+
# Single unstructured-ingest invocation: the options listed before `box`
# follow the `local` subcommand, the ones after it follow `box`.
# BOX_APP_CONFIG_PATH is expanded from the environment; it is not assigned in
# this script. The --remote-url value is a placeholder to be replaced.
# NOTE(review): --box_app_config uses underscores while every other option
# here is hyphenated — confirm the spelling against the CLI.
unstructured-ingest \
7+
local \
8+
--input-path example-docs/book-war-and-peace-1225p.txt \
9+
--output-dir local-output-to-box \
10+
--strategy fast \
11+
--chunk-elements \
12+
--embedding-provider "$EMBEDDING_PROVIDER" \
13+
--num-processes 2 \
14+
--verbose \
15+
box \
16+
--box_app_config "$BOX_APP_CONFIG_PATH" \
17+
--remote-url "<your destination path here, ie 'box://unstructured/war-and-peace-output'>"
18+
```

0 commit comments

Comments
 (0)