Skip to content

Commit e1c15de

Browse files
authored
Merge pull request #53 from linkml/format-utils
format utils
2 parents bded6ae + 22cf225 commit e1c15de

File tree

6 files changed

+531
-267
lines changed

6 files changed

+531
-267
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,3 +137,5 @@ dmypy.json
137137

138138
# Pyre type checker
139139
.pyre/
140+
141+
**/.claude/settings.local.json

poetry.lock

Lines changed: 452 additions & 252 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ py2neo = { version="*", optional = true }
3131
networkx = { version="*", optional = true }
3232
#chromadb = { version="*", optional = true }
3333
pyarrow = { version="*", optional = true }
34+
pyreadr = { version="*", optional = true }
3435
h5py = { version="*", optional = true }
3536
scipy = { version="*", optional = true }
3637
scikit-learn = { version="*", optional = true }
@@ -49,6 +50,7 @@ fastapi = { version="*", optional = true }
4950
uvicorn = { version="*", optional = true }
5051
xmltodict = ">=0.13.0"
5152
jsonpatch = ">=1.33"
53+
jsonpath-ng = "*"
5254
python-dotenv = "^1.0.1"
5355

5456
[tool.poetry.group.dev.dependencies]
@@ -90,6 +92,7 @@ neo4j = ["neo4j", "py2neo", "networkx"]
9092
#chromadb = ["chromadb"]
9193
h5py = ["h5py"]
9294
pyarrow = ["pyarrow"]
95+
pyreadr = ["pyreadr"]
9396
validation = ["linkml"]
9497
map = ["linkml_map"]
9598
renderer = ["linkml_renderer"]

src/linkml_store/cli.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
2626
from linkml_store.utils.object_utils import object_path_update
2727
from linkml_store.utils.pandas_utils import facet_summary_to_dataframe_unmelted
28+
from linkml_store.plotting.cli import plot_cli
2829

2930
DEFAULT_LOCAL_CONF_PATH = Path("linkml.yaml")
3031
# global path is ~/.linkml.yaml in the user's home directory
@@ -205,9 +206,10 @@ def drop(ctx):
205206
@click.option("--format", "-f", type=format_choice, help="Input format")
206207
@click.option("--object", "-i", multiple=True, help="Input object as YAML")
207208
@click.option("--source-field", help="If provided, inject file path source as this field")
209+
@click.option("--glob-files/--no-glob-files", default=False, show_default=True, help="If true, use glob to find files")
208210
@json_select_query_option
209211
@click.pass_context
210-
def insert(ctx, files, replace, object, format, source_field, json_select_query):
212+
def insert(ctx, files, glob_files, replace, object, format, source_field, json_select_query):
211213
"""Insert objects from files (JSON, YAML, TSV) into the specified collection.
212214
213215
Using a configuration:
@@ -216,6 +218,11 @@ def insert(ctx, files, replace, object, format, source_field, json_select_query)
216218
217219
Note: if you don't provide a schema this will be inferred, but it is
218220
usually better to provide an explicit schema
221+
222+
You can use --glob-files if the list of files is too long
223+
224+
linkml-store -C config.yaml -c genes insert "data/genes/*.json" --glob-files
225+
219226
"""
220227
settings = ctx.obj["settings"]
221228
collection = settings.collection
@@ -226,7 +233,15 @@ def insert(ctx, files, replace, object, format, source_field, json_select_query)
226233
load_objects_args = {}
227234
if json_select_query:
228235
load_objects_args["select_query"] = json_select_query
236+
if glob_files:
237+
import glob
238+
new_files = []
239+
for file_path in files:
240+
new_files.extend(glob.glob(file_path))
241+
logger.info(f"Found {len(new_files)} files matching glob pattern {files}")
242+
files = new_files
229243
for file_path in files:
244+
230245
if format:
231246
objects = load_objects(file_path, format=format, **load_objects_args)
232247
else:
@@ -486,12 +501,14 @@ def fq(ctx, where, limit, columns, output_type, wide, output, **kwargs):
486501
487502
Nested columns:
488503
489-
linkml-store -d phenopackets fq subject.timeAtLastEncounter.age
504+
linkml-store -d phenopackets fq -S subject.timeAtLastEncounter.age
490505
491506
Compound keys:
492507
493508
linkml-store -d phenopackets fq subject.sex+subject.timeAtLastEncounter.age
494509
510+
(TODO: compound keys do not work on solr)
511+
495512
"""
496513
collection = ctx.obj["settings"].collection
497514
where_clause = yaml.safe_load(where) if where else None
@@ -948,5 +965,7 @@ def validate(ctx, output_type, output, collection_only, **kwargs):
948965
click.echo(output_data)
949966

950967

968+
cli.add_command(plot_cli, name="plot")
969+
951970
if __name__ == "__main__":
952971
cli()

src/linkml_store/utils/format_utils.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class Format(Enum):
4444
DAT = "dat"
4545
MARKDOWN = "markdown"
4646
PKL = "pkl"
47+
RDS = "rds"
4748
PYTHON = "python"
4849
PARQUET = "parquet"
4950
HDF5 = "hdf5"
@@ -195,6 +196,9 @@ def process_file(
195196
objs = xmltodict.parse(f.read())
196197
elif format == Format.PKL:
197198
objs = pd.read_pickle(f).to_dict(orient="records")
199+
elif format == Format.RDS:
200+
import pyreadr
201+
objs = pyreadr.read_r(f)
198202
elif format == Format.XLSX:
199203
xls = pd.ExcelFile(f)
200204
objs = {sheet: clean_nested_structure(xls.parse(sheet).to_dict(orient="records")) for sheet in xls.sheet_names}
@@ -349,21 +353,25 @@ def load_objects(
349353
all_objects = process_file(f, format, expected_type, header_comment_token)
350354

351355
logger.debug(f"Loaded {len(all_objects)} objects from {file_path}")
352-
if select_query:
353-
import jsonpath_ng as jp
354-
355-
path_expr = jp.parse(select_query)
356-
new_objs = []
357-
for obj in all_objects:
358-
for match in path_expr.find(obj):
359-
logging.debug(f"Match: {match.value}")
360-
if isinstance(match.value, list):
361-
new_objs.extend(match.value)
362-
else:
363-
new_objs.append(match.value)
364-
all_objects = new_objs
356+
all_objects = transform_objects(all_objects, select_query)
365357
return all_objects
366358

359+
def transform_objects(all_objects: List[Dict[str, Any]], select_query: Optional[str]) -> List[Any]:
    """
    Apply an optional JSONPath selection to a list of loaded objects.

    Each object is queried with ``select_query`` and every match contributes to
    the result: a match whose value is a list is flattened into the result, any
    other value is appended as-is. The returned items are therefore not
    necessarily dicts (e.g. selecting ``id`` yields a list of identifiers).

    :param all_objects: objects to transform
    :param select_query: JSONPath expression, parsed with ``jsonpath_ng``; when
        None or empty the input list is returned unchanged
    :return: ``all_objects`` itself when no query is given, otherwise a new list
        of the matched values, in input order
    """
    if not select_query:
        return all_objects

    # Imported after the guard so jsonpath_ng is only loaded when a query is given.
    import jsonpath_ng as jp

    path_expr = jp.parse(select_query)
    selected: List[Any] = []
    for obj in all_objects:
        for match in path_expr.find(obj):
            value = match.value
            # Use the module logger (as load_objects does) with lazy %-formatting.
            logger.debug("Match: %s", value)
            if isinstance(value, list):
                selected.extend(value)
            else:
                selected.append(value)
    return selected
367375

368376
def write_output(
369377
data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame],

tests/test_utils/test_format_utils.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import pytest
88
import yaml
9+
from linkml_store.utils.format_utils import Format, load_objects, render_output, transform_objects
910

1011
from linkml_store.utils.format_utils import Format, load_objects, render_output
1112
from tests.conftest import CSV_FILE, JSON_FILE, TEST_DATA, TSV_FILE, YAML_FILE
@@ -133,3 +134,34 @@ def test_load_objects_from_tgz():
133134
for loaded_obj in matching_objects:
134135
assert loaded_obj["name"] == original_obj["name"]
135136
assert int(loaded_obj["age"]) == int(original_obj["age"]) # Convert to int for comparison
137+
138+
# Shared fixture: two flat records, each with a nested ``address`` object.
OBJS = [
    {
        "id": "P1",
        "address": {
            "street": "1 oak st",
            "city": "Oakland",
        },
    },
    {
        "id": "P2",
        "address": {
            "street": "2 spruce st",
            "city": "Spruceland",
        },
    },
]


@pytest.mark.parametrize(
    "objects, select_expr, expected",
    [
        ([], None, []),
        ([], "x", []),
        (OBJS, None, OBJS),
        # an empty query string is treated the same as no query
        (OBJS, "", OBJS),
        (OBJS, "id", ["P1", "P2"]),
        (OBJS, "address.city", ["Oakland", "Spruceland"]),
        # list-valued matches are flattened into the result
        ([{"tags": ["a", "b"]}, {"tags": ["c"]}], "tags", ["a", "b", "c"]),
    ],
)
def test_transform_objects(objects, select_expr, expected):
    """Check that transform_objects applies the select expression as expected."""
    tr_objects = transform_objects(objects, select_expr)
    assert tr_objects == expected

0 commit comments

Comments
 (0)