From 596196997f03034a526da664db1ec64e8495bd20 Mon Sep 17 00:00:00 2001
From: MikeWLloyd
Date: Wed, 1 May 2024 16:00:40 -0400
Subject: [PATCH 01/29] clastr api proof of concept
---
poetry.lock | 114 ++++++++++------
pyproject.toml | 2 +-
requirements.txt | 2 +-
strprofiler/shiny_app/clastr_api.py | 197 ++++++++++++++++++++++++++++
strprofiler/shiny_app/shiny_app.py | 77 ++++++++++-
strprofiler/utils.py | 2 +-
6 files changed, 346 insertions(+), 48 deletions(-)
create mode 100644 strprofiler/shiny_app/clastr_api.py
diff --git a/poetry.lock b/poetry.lock
index b35cc6f..d4bed28 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
[[package]]
name = "anyio"
@@ -88,13 +88,13 @@ files = [
[[package]]
name = "exceptiongroup"
-version = "1.2.0"
+version = "1.2.1"
description = "Backport of PEP 654 (exception groups)"
optional = false
python-versions = ">=3.7"
files = [
- {file = "exceptiongroup-1.2.0-py3-none-any.whl", hash = "sha256:4bfd3996ac73b41e9b9628b04e079f193850720ea5945fc96a08633c66912f14"},
- {file = "exceptiongroup-1.2.0.tar.gz", hash = "sha256:91f5c769735f051a4290d52edd0858999b57e5876e9f85937691bd4c9fa3ed68"},
+ {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"},
+ {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"},
]
[package.extras]
@@ -391,51 +391,76 @@ files = [
[[package]]
name = "pandas"
-version = "1.5.3"
+version = "2.2.2"
description = "Powerful data structures for data analysis, time series, and statistics"
optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
files = [
- {file = "pandas-1.5.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406"},
- {file = "pandas-1.5.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572"},
- {file = "pandas-1.5.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996"},
- {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354"},
- {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23"},
- {file = "pandas-1.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328"},
- {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc"},
- {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d"},
- {file = "pandas-1.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc"},
- {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae"},
- {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6"},
- {file = "pandas-1.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003"},
- {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813"},
- {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31"},
- {file = "pandas-1.5.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792"},
- {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7"},
- {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf"},
- {file = "pandas-1.5.3-cp38-cp38-win32.whl", hash = "sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51"},
- {file = "pandas-1.5.3-cp38-cp38-win_amd64.whl", hash = "sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373"},
- {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa"},
- {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee"},
- {file = "pandas-1.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a"},
- {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0"},
- {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5"},
- {file = "pandas-1.5.3-cp39-cp39-win32.whl", hash = "sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a"},
- {file = "pandas-1.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9"},
- {file = "pandas-1.5.3.tar.gz", hash = "sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1"},
+ {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"},
+ {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"},
+ {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"},
+ {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"},
+ {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"},
+ {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"},
+ {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"},
+ {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"},
+ {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"},
+ {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"},
+ {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"},
+ {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"},
+ {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"},
+ {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"},
+ {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"},
+ {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"},
+ {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"},
+ {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"},
+ {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"},
+ {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"},
+ {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"},
+ {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"},
+ {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"},
+ {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"},
+ {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"},
+ {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"},
+ {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"},
+ {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"},
+ {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"},
]
[package.dependencies]
numpy = [
- {version = ">=1.20.3", markers = "python_version < \"3.10\""},
- {version = ">=1.23.2", markers = "python_version >= \"3.11\""},
- {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""},
+ {version = ">=1.22.4", markers = "python_version < \"3.11\""},
+ {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+ {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
-python-dateutil = ">=2.8.1"
+python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
+tzdata = ">=2022.7"
[package.extras]
-test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"]
+all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"]
+aws = ["s3fs (>=2022.11.0)"]
+clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"]
+compression = ["zstandard (>=0.19.0)"]
+computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"]
+consortium-standard = ["dataframe-api-compat (>=0.1.7)"]
+excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"]
+feather = ["pyarrow (>=10.0.1)"]
+fss = ["fsspec (>=2022.11.0)"]
+gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"]
+hdf5 = ["tables (>=3.8.0)"]
+html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"]
+mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"]
+output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"]
+parquet = ["pyarrow (>=10.0.1)"]
+performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"]
+plot = ["matplotlib (>=3.6.3)"]
+postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"]
+pyarrow = ["pyarrow (>=10.0.1)"]
+spss = ["pyreadstat (>=1.2.0)"]
+sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"]
+test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"]
+xml = ["lxml (>=4.9.2)"]
[[package]]
name = "prompt-toolkit"
@@ -664,6 +689,17 @@ files = [
{file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"},
]
+[[package]]
+name = "tzdata"
+version = "2024.1"
+description = "Provider of IANA time zone data"
+optional = false
+python-versions = ">=2"
+files = [
+ {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"},
+ {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
+]
+
[[package]]
name = "uc-micro-py"
version = "1.0.3"
@@ -879,4 +915,4 @@ files = [
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<4.0"
-content-hash = "51da1c89cdbc1170fa0c07a6d445e394791deaf26efaa86b5aa7603b8fcaabcc"
+content-hash = "bada9554d3318a11886e1e0e6d3f020beafe8e4c6db8b9249b3b38fcb6c31ef0"
diff --git a/pyproject.toml b/pyproject.toml
index ccc0b01..f83e082 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ readme = "README.md"
[tool.poetry.dependencies]
python = ">=3.9,<4.0"
-pandas = "^1.4.3"
+pandas = "^2.2"
rich-click = "^1.5.2"
numpy = "^1.26.3"
openpyxl = "^3.0.10"
diff --git a/requirements.txt b/requirements.txt
index 5473afd..cc6d05c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
# Automatically generated by https://github.com/damnever/pigar.
numpy==1.26.3
-pandas==1.5.3
+pandas==2.2
rich-click==1.7.3
shiny==0.8.1
shinyswatch==0.4.2
diff --git a/strprofiler/shiny_app/clastr_api.py b/strprofiler/shiny_app/clastr_api.py
new file mode 100644
index 0000000..aa706c8
--- /dev/null
+++ b/strprofiler/shiny_app/clastr_api.py
@@ -0,0 +1,197 @@
+import requests
+import json
+import pandas as pd
+from flatten_json import flatten
+
+
+def clastr_query(query, query_filter, include_amelogenin, score_filter):
+ url = "https://www.cellosaurus.org/str-search/api/query/"
+
+ if query_filter == "Tanabe":
+ query['algorithm'] = 1
+ elif query_filter == "Masters Query":
+ query['algorithm'] = 2
+ elif query_filter == "Masters Reference":
+ query['algorithm'] = 3
+
+ query['includeAmelogenin'] = include_amelogenin
+ query['scoreFilter'] = score_filter
+
+ r = requests.post(url, data=json.dumps(query))
+
+ # JSON response:
+ # 'description': '',
+ # 'cellosaurusRelease': '48.0',
+ # 'runOn': '2024-Apr-25 12:45:40 UTC+0',
+ # 'toolVersion': '1.4.4',
+ # 'searchSpace': 8581,
+ # 'parameters': {...
+ # 'results': [{ ...
+ # FULL STRUCTURE OUTLINED BELOW.
+
+ try:
+ r.raise_for_status()
+ except requests.exceptions.HTTPError as e:
+ return pd.DataFrame({"Error": [str(e)]})
+
+ df = pd.DataFrame.from_dict(r.json()['results'])
+
+ if df.empty:
+ return pd.DataFrame({"No Clastr Result": []})
+
+ flattened = [flatten(d) for d in r.json()['results']]
+ df = pd.DataFrame(flattened)
+
+ # profiles[0] has 'bestScore' returns.
+ # Markers within profiles[0] are split by each allele 'value'
+ # First select alleles, and then concat alleles by return and marker
+ markers = df.filter(regex='^profiles_0_.*_value').T
+ markers[['A', 'B', 'C', 'markerID', 'E', 'F', 'G']] = markers.index.str.split('_', n=7, expand=False).tolist()
+ markers.drop(['A', 'B', 'C', 'E', 'F', 'G'], axis=1, inplace=True)
+
+ # Melt dataframe to: [markerID, resultID, allele] for cat on markerID/resultID
+ melted_markers = pd.melt(markers, id_vars=['markerID'], var_name='resultID', value_name='allele')
+
+ # Join resultID and markerID index to grouped joined allele strings.
+ allele_cat_markers = pd.concat([
+ melted_markers[['resultID', 'markerID']],
+ melted_markers.groupby(['resultID', 'markerID'], as_index=True).transform(lambda x: ','
+ .join(map(str, x)).replace(",nan", "").replace("nan", ""))
+ ], axis=1).drop_duplicates(subset=['resultID', 'markerID'])
+
+ # Marker names are not consistent across results. MarkerName[1] != the same thing in all cases.
+ # We must track marker name by index by result.
+ # The same logic from above applies, split the compound column name string,
+ # Melt on markerID, and then merge with concat allele made above.
+ # Finally, pivot into a table and rejoin to higher level results.
+ marker_names = df.filter(regex='^profiles_0_.*_name').T
+ marker_names[['A', 'B', 'C', 'markerID', 'E']] = marker_names.index.str.split('_', n=5, expand=False).tolist()
+ marker_names.drop(['A', 'B', 'C', 'E'], axis=1, inplace=True)
+
+ melted_markers = pd.melt(marker_names, id_vars=['markerID'],
+ var_name='resultID', value_name='markerName').dropna().drop_duplicates(subset=['markerID', 'resultID'])
+
+ markers_names_alleles = pd.merge(allele_cat_markers, melted_markers, how='inner', on=['markerID', 'resultID'])
+
+ pivot_markers_names_alleles = markers_names_alleles.pivot(index=['resultID'], columns='markerName', values='allele')
+
+ try:
+ merged = pd.merge(df[['accession', 'name', 'species', 'bestScore', 'problem']],
+ pivot_markers_names_alleles, left_index=True, right_on='resultID')
+ except KeyError:
+ merged = pd.merge(df[['accession', 'name', 'species', 'bestScore']], pivot_markers_names_alleles, left_index=True, right_on='resultID')
+
+ merged['accession_link'] = "https://web.expasy.org/cellosaurus/" + merged['accession']
+ print(merged)
+ # return final df
+
+ # TO DO: Add query to top of merged DF before return.
+
+ return merged
+
+
+if __name__ == '__main__':
+ # url = "https://www.cellosaurus.org/str-search/api/query/%"
+ # Use above URL for 400 error
+
+ # sample J000077451
+ data = {"Amelogenin": "X,Y",
+ "CSF1PO": "12",
+ "D2S1338": "17,19",
+ "D3S1358": "15",
+ "D5S818": "11,12",
+ "D7S820": "11,12",
+ "D8S1179": "12,15",
+ "D13S317": "8",
+ "D16S539": "13",
+ "D18S51": "14",
+ "D19S433": "14",
+ "D21S11": "31,31.2",
+ "FGA": "23",
+ "Penta D": "",
+ "Penta E": "",
+ "TH01": "7,9.3",
+ "TPOX": "8",
+ "vWA": "18",
+ }
+
+ # # stock from https://www.cellosaurus.org/str-search/help.html#5.1
+ # data = {
+ # "Amelogenin": "X",
+ # "CSF1PO": "13,14",
+ # "D5S818": "13",
+ # "D7S820": "8",
+ # "D13S317": "12",
+ # "FGA": "24",
+ # "TH01": "8",
+ # "TPOX": "11",
+ # "vWA": "16",
+ # }
+
+ r = clastr_query(data, 'Tanabe', False, 70)
+
+ print(r)
+
+# JSON data structure:
+# {
+# "description": "",
+# "cellosaurusRelease": "48.0",
+# "runOn": "2024-Apr-30 18:15:31 UTC+0",
+# "toolVersion": "1.4.4",
+# "searchSpace": 8581,
+# "parameters": {
+# "species": "Homo sapiens (Human)",
+# "algorithm": "Tanabe",
+# "scoringMode": "Non-empty makers",
+# "scoreFilter": 70,
+# "minMarkers": 8,
+# "maxResults": 200,
+# "includeAmelogenin": false,
+# "markers": [ {
+# "name": "Amelogenin",
+# "alleles": [
+# {
+# "value": "X"
+# },
+# {
+# "value": "Y"
+# }
+# ]
+# }, ... ]
+# },
+# "results": [
+# {
+# "accession": "CVCL_2335",
+# "name": "CCD-1076Sk",
+# "species": "Homo sapiens (Human)",
+# "bestScore": 72.0,
+# "problematic": false,
+# "profiles": [
+# {
+# "score": 72.0,
+# "markerNumber": 8,
+# "alleleNumber": 14,
+# "markers": [
+# {
+# "name": "Amelogenin",
+# "conflicted": false,
+# "searched": true,
+# "sources": [],
+# "alleles": [
+# {
+# "value": "X",
+# "matched": true
+# },
+# {
+# "value": "Y",
+# "matched": true
+# }
+# ]
+# },
+# ...
+# }
+#
+#
+# }
+#
+# ]
diff --git a/strprofiler/shiny_app/shiny_app.py b/strprofiler/shiny_app/shiny_app.py
index 7ec1ce0..daf4869 100644
--- a/strprofiler/shiny_app/shiny_app.py
+++ b/strprofiler/shiny_app/shiny_app.py
@@ -6,6 +6,7 @@
import strprofiler.utils as sp
from strprofiler.shiny_app.calc_functions import _single_query, _batch_query, _file_query
+from strprofiler.shiny_app.clastr_api import clastr_query
from datetime import date
import time
@@ -65,9 +66,15 @@ def _highlight_non_matches(s):
is_match = s == s.iloc[0]
return ["text-align:center;background-color:#ec7a80" if not v else "text-align:center" for v in is_match]
-# App Generation ###
+
+def _link_wrap(name, link, problem):
+ if not pd.isna(problem):
+ return ui.tooltip(ui.tags.a(name, href=str(link), target="_blank", style="text-align:center;font-style:oblique;color:#ec7a80"), f"{problem}")
+ else:
+ return ui.tags.a(name, href=str(link), target="_blank")
+# App Generation ###
def create_app(db=None):
f = importlib.resources.files("strprofiler.shiny_app")
@@ -93,7 +100,6 @@ def create_app(db=None):
)
)
- # TODO move this to a separate function
app_ui = ui.page_fluid(
ui.tags.style("#main {padding:12px !important} #sidebar {padding:12px}"),
ui.tags.style(
@@ -188,6 +194,12 @@ def create_app(db=None):
class_="btn-danger",
width="45%",
),
+ ui.input_action_button(
+ "clastr",
+ "Clastr",
+ class_="btn-success",
+ width="45%",
+ ),
),
),
),
@@ -199,10 +211,23 @@ def create_app(db=None):
ui.column(3, ui.tags.h3("Results")),
ui.column(1, ui.p("")),
),
- ui.column(
- 12,
- {"id": "res_card"},
- ui.output_table("out_result"),
+ ui.navset_card_tab(
+ ui.nav_panel(
+ "STR Profiler",
+ ui.column(
+ 12,
+ {"id": "res_card"},
+ ui.output_table("out_result"),
+ ),
+ ),
+ ui.nav_panel(
+ "CLASTR",
+ ui.column(
+ 12,
+ {"id": "res_card"},
+ ui.output_table("clastr_table"),
+ ),
+ ),
),
full_screen=False,
fill=False,
@@ -416,6 +441,7 @@ def server(input, output, session):
str_database = reactive.value(init_db)
db_name = reactive.value(init_db_name)
output_df = reactive.value(None)
+ output_df_clastr = reactive.value(None)
demo_vals = reactive.value(None)
demo_name = reactive.value(None)
markers = reactive.value([i for i in list(init_db[next(iter(init_db))].keys()) if not any([e for e in ['Center', 'Passage'] if e in i])])
@@ -554,6 +580,45 @@ def loaded_example_text():
x = ui.strong("")
return x
+ @reactive.calc
+ @reactive.event(input.clastr)
+ def clastr_results():
+ query = {m: input[m]() for m in markers()}
+ thinking = ui.notification_show("Message ", duration=None)
+ clastr_return = clastr_query(query, input.query_filter(), input.score_amel_query(), input.query_filter_threshold())
+ ui.notification_remove(thinking)
+ return clastr_return
+
+ @output
+ @render.table
+ def clastr_table():
+ output_df_clastr.set(clastr_results())
+ if output_df_clastr() is not None:
+ out_df = output_df_clastr().copy()
+ print(out_df)
+ if ('No Clastr Result' in out_df.columns) | ('Error' in out_df.columns):
+ return out_df
+ try:
+ out_df['link'] = out_df.apply(lambda x: _link_wrap(x.accession, x.accession_link, x.problem), axis=1)
+ except Exception:
+ out_df['link'] = out_df.apply(lambda x: _link_wrap(x.accession, x.accession_link, pd.NA), axis=1)
+ out_df = out_df.drop(['accession', 'accession_link', 'species'], axis=1).rename(
+ columns={"link": "Accession", "name": "Name", "bestScore": "Score"})
+ cols = list(out_df.columns)
+ cols = [cols[-1]] + cols[:-1]
+ out_df = out_df[cols]
+ out_df = out_df.style.set_table_attributes(
+ 'class="dataframe shiny-table table w-auto"'
+ ).hide(axis="index").format(
+ {
+ "Score": "{0:0.2f}",
+ },
+ na_rep=""
+ )
+ else:
+ out_df = pd.DataFrame({"No input provided.": []})
+ return out_df
+
# Dealing with calculating a results table
# Catch when either reset or search is clicked
# If reset, clear the query and run to make an empty df.
diff --git a/strprofiler/utils.py b/strprofiler/utils.py
index 33e6ffa..987128f 100644
--- a/strprofiler/utils.py
+++ b/strprofiler/utils.py
@@ -187,7 +187,7 @@ def str_ingress(
else:
sys.exit('File extension: ' + path.suffix + ' in file: ' + str(path) + ' is not supported.')
- df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
+ df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
df.columns = df.columns.str.strip()
From 9916ad9005fab5dac18349037d1a83d608f5ccaa Mon Sep 17 00:00:00 2001
From: MikeWLloyd
Date: Thu, 2 May 2024 14:40:40 -0400
Subject: [PATCH 02/29] query row added, UI adjustment
---
app.py | 3 +
strprofiler/shiny_app/clastr_api.py | 23 +++-
strprofiler/shiny_app/shiny_app.py | 160 +++++++++++++---------------
3 files changed, 94 insertions(+), 92 deletions(-)
create mode 100644 app.py
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..69d5d26
--- /dev/null
+++ b/app.py
@@ -0,0 +1,3 @@
+from strprofiler.shiny_app.shiny_app import create_app
+
+app = create_app()
diff --git a/strprofiler/shiny_app/clastr_api.py b/strprofiler/shiny_app/clastr_api.py
index aa706c8..6a7386b 100644
--- a/strprofiler/shiny_app/clastr_api.py
+++ b/strprofiler/shiny_app/clastr_api.py
@@ -7,6 +7,10 @@
def clastr_query(query, query_filter, include_amelogenin, score_filter):
url = "https://www.cellosaurus.org/str-search/api/query/"
+ dct = {k: [v] for k, v in query.items()}
+ query_df = pd.DataFrame(dct)
+ query_df['accession'] = 'Query'
+
if query_filter == "Tanabe":
query['algorithm'] = 1
elif query_filter == "Masters Query":
@@ -82,12 +86,23 @@ def clastr_query(query, query_filter, include_amelogenin, score_filter):
merged = pd.merge(df[['accession', 'name', 'species', 'bestScore']], pivot_markers_names_alleles, left_index=True, right_on='resultID')
merged['accession_link'] = "https://web.expasy.org/cellosaurus/" + merged['accession']
- print(merged)
- # return final df
- # TO DO: Add query to top of merged DF before return.
+ # add the query line to the top of merged, and reorder columns
+
+ query_added = pd.concat([query_df, merged]).reset_index(drop=True)
+ query_added["bestScore"] = query_added['bestScore'].map("{0:.2f}".format).replace("nan", "")
+
+ # print(query_added.columns)
+
+ if 'problem' in query_added.columns:
+ query_added = query_added[['accession', 'name', 'species', 'bestScore', 'accession_link', 'problem'] +
+ [c for c in query_added if c not in
+ ['accession', 'name', 'species', 'bestScore', 'accession_link', 'problem']]].fillna('')
+ else:
+ query_added = query_added[['accession', 'name', 'species', 'bestScore', 'accession_link'] +
+ [c for c in query_added if c not in ['accession', 'name', 'species', 'bestScore', 'accession_link']]].fillna('')
- return merged
+ return query_added
if __name__ == '__main__':
diff --git a/strprofiler/shiny_app/shiny_app.py b/strprofiler/shiny_app/shiny_app.py
index daf4869..0d08163 100644
--- a/strprofiler/shiny_app/shiny_app.py
+++ b/strprofiler/shiny_app/shiny_app.py
@@ -68,7 +68,9 @@ def _highlight_non_matches(s):
def _link_wrap(name, link, problem):
- if not pd.isna(problem):
+ if name == 'Query':
+ return name
+ if problem != "":
return ui.tooltip(ui.tags.a(name, href=str(link), target="_blank", style="text-align:center;font-style:oblique;color:#ec7a80"), f"{problem}")
else:
return ui.tags.a(name, href=str(link), target="_blank")
@@ -182,6 +184,12 @@ def create_app(db=None):
ui.column(4, ui.output_ui("loaded_example_text")),
ui.column(
4,
+ ui.input_select(
+ "search_type",
+ "Search Type",
+ ["STR DB", "CLASTR"],
+ width="90%"
+ ),
ui.input_action_button(
"search",
"Search",
@@ -194,12 +202,6 @@ def create_app(db=None):
class_="btn-danger",
width="45%",
),
- ui.input_action_button(
- "clastr",
- "Clastr",
- class_="btn-success",
- width="45%",
- ),
),
),
),
@@ -211,23 +213,10 @@ def create_app(db=None):
ui.column(3, ui.tags.h3("Results")),
ui.column(1, ui.p("")),
),
- ui.navset_card_tab(
- ui.nav_panel(
- "STR Profiler",
- ui.column(
- 12,
- {"id": "res_card"},
- ui.output_table("out_result"),
- ),
- ),
- ui.nav_panel(
- "CLASTR",
- ui.column(
- 12,
- {"id": "res_card"},
- ui.output_table("clastr_table"),
- ),
- ),
+ ui.column(
+ 12,
+ {"id": "res_card"},
+ ui.output_table("out_result"),
),
full_screen=False,
fill=False,
@@ -418,7 +407,7 @@ def create_app(db=None):
icon_svg("github", width="30px"),
href="https://github.com/j-andrews7/strprofiler",
target="_blank",
- )
+ ),
),
title=ui.tags.a(
ui.tags.img(
@@ -441,7 +430,6 @@ def server(input, output, session):
str_database = reactive.value(init_db)
db_name = reactive.value(init_db_name)
output_df = reactive.value(None)
- output_df_clastr = reactive.value(None)
demo_vals = reactive.value(None)
demo_name = reactive.value(None)
markers = reactive.value([i for i in list(init_db[next(iter(init_db))].keys()) if not any([e for e in ['Center', 'Passage'] if e in i])])
@@ -580,45 +568,6 @@ def loaded_example_text():
x = ui.strong("")
return x
- @reactive.calc
- @reactive.event(input.clastr)
- def clastr_results():
- query = {m: input[m]() for m in markers()}
- thinking = ui.notification_show("Message ", duration=None)
- clastr_return = clastr_query(query, input.query_filter(), input.score_amel_query(), input.query_filter_threshold())
- ui.notification_remove(thinking)
- return clastr_return
-
- @output
- @render.table
- def clastr_table():
- output_df_clastr.set(clastr_results())
- if output_df_clastr() is not None:
- out_df = output_df_clastr().copy()
- print(out_df)
- if ('No Clastr Result' in out_df.columns) | ('Error' in out_df.columns):
- return out_df
- try:
- out_df['link'] = out_df.apply(lambda x: _link_wrap(x.accession, x.accession_link, x.problem), axis=1)
- except Exception:
- out_df['link'] = out_df.apply(lambda x: _link_wrap(x.accession, x.accession_link, pd.NA), axis=1)
- out_df = out_df.drop(['accession', 'accession_link', 'species'], axis=1).rename(
- columns={"link": "Accession", "name": "Name", "bestScore": "Score"})
- cols = list(out_df.columns)
- cols = [cols[-1]] + cols[:-1]
- out_df = out_df[cols]
- out_df = out_df.style.set_table_attributes(
- 'class="dataframe shiny-table table w-auto"'
- ).hide(axis="index").format(
- {
- "Score": "{0:0.2f}",
- },
- na_rep=""
- )
- else:
- out_df = pd.DataFrame({"No input provided.": []})
- return out_df
-
# Dealing with calculating a results table
# Catch when either reset or search is clicked
# If reset, clear the query and run to make an empty df.
@@ -650,7 +599,6 @@ def loaded_example_text():
ui.remove_ui("#inserted-downloader")
res_click.set(0)
-
return None
if res_click() == 0:
ui.insert_ui(
@@ -664,34 +612,70 @@ def loaded_example_text():
where="afterEnd",
)
res_click.set(1)
-
- return _single_query(
- query,
- str_database(),
- input.score_amel_query(),
- input.mix_threshold_query(),
- input.query_filter(),
- input.query_filter_threshold(),
- )
+ thinking = ui.notification_show("Message: API Query Running.", duration=None)
+ # isolate input.search_type to prevent trigger when options change.
+ with reactive.isolate():
+ if input.search_type() == 'STR DB':
+ results = _single_query(
+ query,
+ str_database(),
+ input.score_amel_query(),
+ input.mix_threshold_query(),
+ input.query_filter(),
+ input.query_filter_threshold(),
+ )
+ elif input.search_type() == 'CLASTR':
+ results = clastr_query(
+ query,
+ input.query_filter(),
+ input.score_amel_query(),
+ input.query_filter_threshold()
+ )
+ ui.notification_remove(thinking)
+ return results
@output
@render.table
def out_result():
output_df.set(output_results())
if output_df() is not None:
- out_df = output_df().copy()
- out_df = out_df.style.set_table_attributes(
- 'class="dataframe shiny-table table w-auto"'
- ).hide(axis="index").apply(_highlight_non_matches, subset=markers(), axis=0).format(
- {
- "Shared Markers": "{0:0.0f}",
- "Shared Alleles": "{0:0.0f}",
- "Tanabe Score": "{0:0.2f}",
- "Masters Query Score": "{0:0.2f}",
- "Masters Ref Score": "{0:0.2f}",
- },
- na_rep=""
- )
+ # isolate input.search_type to prevent trigger when options change.
+ with reactive.isolate():
+ if input.search_type() == 'STR DB':
+ out_df = output_df().copy()
+ out_df = out_df.style.set_table_attributes(
+ 'class="dataframe shiny-table table w-auto"'
+ ).hide(axis="index").apply(_highlight_non_matches, subset=markers(), axis=0).format(
+ {
+ "Shared Markers": "{0:0.0f}",
+ "Shared Alleles": "{0:0.0f}",
+ "Tanabe Score": "{0:0.2f}",
+ "Masters Query Score": "{0:0.2f}",
+ "Masters Ref Score": "{0:0.2f}",
+ },
+ na_rep=""
+ )
+ elif input.search_type() == 'CLASTR':
+ out_df = output_df().copy()
+ print(out_df)
+ if ('No Clastr Result' in out_df.columns) | ('Error' in out_df.columns):
+ return out_df
+ try:
+ out_df['link'] = out_df.apply(lambda x: _link_wrap(x.accession, x.accession_link, x.problem), axis=1)
+ out_df.drop(columns=['problem'], inplace=True)
+ except Exception:
+ out_df['link'] = out_df.apply(lambda x: _link_wrap(x.accession, x.accession_link, ''), axis=1)
+
+ out_df = out_df.drop(['accession', 'accession_link', 'species'], axis=1).rename(
+ columns={"link": "Accession", "name": "Name", "bestScore": "Score"})
+
+ cols = list(out_df.columns)
+ cols = [cols[-1]] + cols[:-1]
+
+ out_df = out_df[cols]
+ out_df = out_df.style.set_table_attributes(
+ 'class="dataframe shiny-table table w-auto"'
+ ).hide(axis="index").apply(_highlight_non_matches, subset=markers(), axis=0)
else:
out_df = pd.DataFrame({"No input provided.": []})
return out_df
From 9bba28fc359420b1cedd480100d39150b8b4e90b Mon Sep 17 00:00:00 2001
From: MikeWLloyd
Date: Thu, 2 May 2024 15:51:20 -0400
Subject: [PATCH 03/29] tooltip added, help updated, req for deploy updated
---
requirements.txt | 5 ++-
strprofiler/shiny_app/shiny_app.py | 33 +++++++++-----
strprofiler/shiny_app/www/help.html | 69 ++++++++++++++++++++++++++---
strprofiler/shiny_app/www/help.md | 31 ++++++++++---
4 files changed, 115 insertions(+), 23 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index cc6d05c..9ebff4b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,10 @@
# Automatically generated by https://github.com/damnever/pigar.
+faicons==0.2.2
+flatten-json==0.1.14
numpy==1.26.3
-pandas==2.2
+pandas==2.2.2
+requests==2.31.0
rich-click==1.7.3
shiny==0.8.1
shinyswatch==0.4.2
diff --git a/strprofiler/shiny_app/shiny_app.py b/strprofiler/shiny_app/shiny_app.py
index 0d08163..397fac2 100644
--- a/strprofiler/shiny_app/shiny_app.py
+++ b/strprofiler/shiny_app/shiny_app.py
@@ -187,14 +187,19 @@ def create_app(db=None):
ui.input_select(
"search_type",
"Search Type",
- ["STR DB", "CLASTR"],
+ ["STRprofiler Database", "Cellosaurus Database (CLASTR)"],
width="90%"
),
- ui.input_action_button(
- "search",
- "Search",
- class_="btn-success",
- width="45%",
+ ui.tooltip(
+ ui.input_action_button(
+ "search",
+ "Search",
+ class_="btn-success",
+ width="45%",
+ ),
+ "Query STRprofilier Database",
+ id="tt_selected_search",
+ placement="left",
),
ui.input_action_button(
"reset",
@@ -450,6 +455,14 @@ def database_file():
width="100%",
)
+ @reactive.effect
+ @reactive.event(input.search_type)
+ def update_tooltip_msg():
+ if input.search_type() == 'STRprofiler Database':
+ ui.update_tooltip("tt_selected_search", 'Query STRprofilier Database', show=False)
+ if input.search_type() == 'Cellosaurus Database (CLASTR)':
+ ui.update_tooltip("tt_selected_search", 'Query Cellosaurus Database via CLASTR API', show=False)
+
@render.ui
@reactive.event(markers)
def marker_inputs():
@@ -615,7 +628,7 @@ def loaded_example_text():
thinking = ui.notification_show("Message: API Query Running.", duration=None)
# isolate input.search_type to prevent trigger when options change.
with reactive.isolate():
- if input.search_type() == 'STR DB':
+ if input.search_type() == 'STRprofiler Database':
results = _single_query(
query,
str_database(),
@@ -624,7 +637,7 @@ def loaded_example_text():
input.query_filter(),
input.query_filter_threshold(),
)
- elif input.search_type() == 'CLASTR':
+ elif input.search_type() == 'Cellosaurus Database (CLASTR)':
results = clastr_query(
query,
input.query_filter(),
@@ -641,7 +654,7 @@ def out_result():
if output_df() is not None:
# isolate input.search_type to prevent trigger when options change.
with reactive.isolate():
- if input.search_type() == 'STR DB':
+ if input.search_type() == 'STRprofiler Database':
out_df = output_df().copy()
out_df = out_df.style.set_table_attributes(
'class="dataframe shiny-table table w-auto"'
@@ -655,7 +668,7 @@ def out_result():
},
na_rep=""
)
- elif input.search_type() == 'CLASTR':
+ elif input.search_type() == 'Cellosaurus Database (CLASTR)':
out_df = output_df().copy()
print(out_df)
if ('No Clastr Result' in out_df.columns) | ('Error' in out_df.columns):
diff --git a/strprofiler/shiny_app/www/help.html b/strprofiler/shiny_app/www/help.html
index 6c1bdf1..513700f 100644
--- a/strprofiler/shiny_app/www/help.html
+++ b/strprofiler/shiny_app/www/help.html
@@ -1524,12 +1524,20 @@ Default Database
Laboratory PDX program
If this app is hosted with a custom database, please contact the host
for information on the database source.
+
+
+
CLASTR / Cellosaurus API Query
+
Query of the Cellosaurus
+(Bairoch, 2018) cell line database is also available for single samples
+via the CLASTR
+(Robin, Capes-Davis, and Bairoch, 2019) REST
+API.
Single Query Report
For individual samples, a report is generated with the following
-fields.
+fields when ‘STR DB’ is selected as the search type.
@@ -1562,17 +1570,59 @@ Single Query Report
Tanabe Score |
Tanabe similarity score between the query and database
-sample. |
+sample (if Tanabe selected).
Master Query Score |
Master ‘Query’ similarity score between the query and
-database sample. |
+database sample (if Master Query selected).
Master Ref Score |
Master ‘Reference’ similarity score between the query
-and database sample. |
+and database sample (if Master Ref selected).
+
+
+Markers 1 … n |
+Marker alleles with mismatches highlighted. |
+
+
+
+
The report is filtered to include only those samples with greater
+than or equal to the Similarity Score Filter Threshold
+defined by the user, and report only the similarity score selected.
+
When ‘CLASTR’ is selected as the search type, a report is generated
+with the following fields:
+
+
+
+
+
+
+
+
+
+
+Accession |
+Cellosaurus cell line accession ID. Links are provided
+to each accession information page. |
+
+
+Name |
+Cell line name. |
+
+
+Score |
+Similarity score between the query and cell line
+sample. Reported score reflects the selected Similarity Score
+Filter. |
+
+
+Markers 1 … n |
+Marker alleles with mismatches highlighted. |
@@ -1716,14 +1766,21 @@
Batch and File Query Specfic
-
-
Reference
+
+
References
strprofiler
is provided under the MIT license. If you
use this app in your research please cite:
Jared Andrews, Mike Lloyd, & Sam Culley. (2024).
j-andrews7/strprofiler:
v0.2.0. Zenodo.
https://doi.org/10.5281/zenodo.10544686
+
Bairoch A. (2018) The Cellosaurus, a cell line knowledge resource.
+Journal of Biomolecular Techniques. 29:25-38. DOI:
+10.7171/jbt.18-2902-002; PMID: 29805321
+
Robin, T., Capes-Davis, A. & Bairoch, A. (2019) CLASTR: the
+Cellosaurus STR Similarity Search Tool - A Precious Help for Cell Line
+Authentication. International Journal of Cancer. PubMed: 31444973 DOI:
+10.1002/IJC.32639
diff --git a/strprofiler/shiny_app/www/help.md b/strprofiler/shiny_app/www/help.md
index c6b61f1..7aac4c7 100644
--- a/strprofiler/shiny_app/www/help.md
+++ b/strprofiler/shiny_app/www/help.md
@@ -20,21 +20,36 @@ The report will differ depending on if an individual sample or batch of samples
## Default Database
Current data underlying the default database were provided by: [The Jackson Laboratory PDX program](https://tumor.informatics.jax.org/mtbwi/pdxSearch.do)
-If this app is hosted with a custom database, please contact the host for information on the database source.
+If this app is hosted with a custom database, please contact the host for information on the database source.
+
+## CLASTR / Cellosaurus API Query
+Query of the [Cellosaurus](https://www.cellosaurus.org/description.html) (Bairoch, 2018) cell line database is also available for single samples via the [CLASTR](https://www.cellosaurus.org/str-search/) (Robin, Capes-Davis, and Bairoch, 2019) [REST API](https://www.cellosaurus.org/str-search/help.html#5).
---
## Single Query Report
-For individual samples, a report is generated with the following fields.
+For individual samples, a report is generated with the following fields when 'STR DB' is selected as the search type.
| Output Field | Description |
| :--- | :---- |
| Mixed Sample | Flag to indicate sample mixing. Sample mixing is determined by the "'Mixed' Sample Threshold" option. If more markers are tri+ allelic than the threshold, samples are flagged as potentially mixed. |
| Shared Markers | Number of markers shared between the query and database sample. |
| Shared Alleles | Number of alleles shared between the query and database sample. |
-| Tanabe Score | Tanabe similarity score between the query and database sample. |
-| Master Query Score | Master 'Query' similarity score between the query and database sample. |
-| Master Ref Score | Master 'Reference' similarity score between the query and database sample. |
+| Tanabe Score | Tanabe similarity score between the query and database sample (if Tanabe selected). |
+| Master Query Score | Master 'Query' similarity score between the query and database sample (if Master Query selected). |
+| Master Ref Score | Master 'Reference' similarity score between the query and database sample (if Master Ref selected). |
+| Markers 1 ... n | Marker alleles with mismatches highlighted. |
+
+The report is filtered to include only those samples with greater than or equal to the `Similarity Score Filter Threshold` defined by the user, and report only the similarity score selected.
+
+When 'CLASTR' is selected as the search type, a report is generated with the following fields:
+
+| Output Field | Description |
+| :--- | :---- |
+| Accession | Cellosaurus cell line accession ID. Links are provided to each accession information page. |
+| Name | Cell line name. |
+| Score | Similarity score between the query and cell line sample. Reported score reflects the selected Similarity Score Filter. |
+| Markers 1 ... n | Marker alleles with mismatches highlighted. |
The report is filtered to include only those samples with greater than or equal to the `Similarity Score Filter Threshold` defined by the user.
@@ -108,7 +123,11 @@ For batch samples entered in the File Query tab, `STR Similarity` will generate
---
-# Reference
+# References
`strprofiler` is provided under the MIT license. If you use this app in your research please cite:
Jared Andrews, Mike Lloyd, & Sam Culley. (2024).
j-andrews7/strprofiler: v0.2.0. Zenodo.
https://doi.org/10.5281/zenodo.10544686
+
+Bairoch A. (2018) The Cellosaurus, a cell line knowledge resource. Journal of Biomolecular Techniques. 29:25-38. DOI: 10.7171/jbt.18-2902-002; PMID: 29805321
+
+Robin, T., Capes-Davis, A. & Bairoch, A. (2019) CLASTR: the Cellosaurus STR Similarity Search Tool - A Precious Help for Cell Line Authentication. International Journal of Cancer. PubMed: 31444973 DOI: 10.1002/IJC.32639
\ No newline at end of file
From 1cd2ef4b07220af54eaac4f831be246bbc7c2e3a Mon Sep 17 00:00:00 2001
From: MikeWLloyd
Date: Fri, 3 May 2024 08:39:03 -0400
Subject: [PATCH 04/29] add window title
---
strprofiler/shiny_app/shiny_app.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/strprofiler/shiny_app/shiny_app.py b/strprofiler/shiny_app/shiny_app.py
index 397fac2..17fcbfd 100644
--- a/strprofiler/shiny_app/shiny_app.py
+++ b/strprofiler/shiny_app/shiny_app.py
@@ -103,6 +103,7 @@ def create_app(db=None):
)
app_ui = ui.page_fluid(
+ ui.panel_title('', "STR Profiler"),
ui.tags.style("#main {padding:12px !important} #sidebar {padding:12px}"),
ui.tags.style(
".h3 {margin-bottom:0.1rem; line-height:1} .card-body {padding-top:6px; padding-bottom:6px}"
From 86453204cefdb0f6962e7073d780f3b3e9aa9cf4 Mon Sep 17 00:00:00 2001
From: MikeWLloyd
Date: Thu, 9 May 2024 15:33:28 -0400
Subject: [PATCH 05/29] clastr batch method rough in
---
strprofiler/shiny_app/clastr_api.py | 62 +++++++++++-
strprofiler/shiny_app/shiny_app.py | 142 +++++++++++++++++++---------
2 files changed, 154 insertions(+), 50 deletions(-)
diff --git a/strprofiler/shiny_app/clastr_api.py b/strprofiler/shiny_app/clastr_api.py
index 6a7386b..0082298 100644
--- a/strprofiler/shiny_app/clastr_api.py
+++ b/strprofiler/shiny_app/clastr_api.py
@@ -4,7 +4,7 @@
from flatten_json import flatten
-def clastr_query(query, query_filter, include_amelogenin, score_filter):
+def _clastr_query(query, query_filter, include_amelogenin, score_filter):
url = "https://www.cellosaurus.org/str-search/api/query/"
dct = {k: [v] for k, v in query.items()}
@@ -65,7 +65,7 @@ def clastr_query(query, query_filter, include_amelogenin, score_filter):
# Marker names are not consistant across results. MarkerName[1] != the same thing in all cases.
# We must track marker name by index by result.
- # The same logic from above applies, split the compount column name string,
+ # The same logic from above applies, split the compound column name string,
# Melt on markerID, and then merge with concat allele made above.
# Finally, pivot into a table and rejoin to higher level results.
marker_names = df.filter(regex='^profiles_0_.*_name').T
@@ -105,6 +105,30 @@ def clastr_query(query, query_filter, include_amelogenin, score_filter):
return query_added
+def _clastr_batch_query(query, query_filter, include_amelogenin, score_filter):
+ url = "https://www.cellosaurus.org/str-search/api/batch/"
+
+ if query_filter == "Tanabe":
+ query = [dict(item, **{'algorithm': 1}) for item in query]
+ elif query_filter == "Masters Query":
+ query = [dict(item, **{'algorithm': 2}) for item in query]
+ elif query_filter == "Masters Reference":
+ query = [dict(item, **{'algorithm': 2}) for item in query]
+
+ query = [dict(item, **{'includeAmelogenin': include_amelogenin}) for item in query]
+ query = [dict(item, **{'scoreFilter': score_filter}) for item in query]
+ query = [dict(item, **{'outputFormat': 'xlsx'}) for item in query]
+
+ r = requests.post(url, data=json.dumps(query))
+
+ try:
+ r.raise_for_status()
+ except requests.exceptions.HTTPError as e:
+ return pd.DataFrame({"Error": [str(e)]})
+
+ return r
+
+
if __name__ == '__main__':
# url = "https://www.cellosaurus.org/str-search/api/query/%"
# Use above URL for 400 error
@@ -143,10 +167,42 @@ def clastr_query(query, query_filter, include_amelogenin, score_filter):
# "vWA": "16",
# }
- r = clastr_query(data, 'Tanabe', False, 70)
+ r = _clastr_query(data, 'Tanabe', False, 70)
print(r)
+ batch_data = [{
+ "description": "Example 1",
+ "Amelogenin": "X",
+ "CSF1PO": "13,14",
+ "D5S818": "13",
+ "D7S820": "8",
+ "D13S317": "12",
+ "FGA": "24",
+ "TH01": "8",
+ "TPOX": "11",
+ "vWA": "16",
+ }, {
+ "description": "Example 2",
+ "Amelogenin": "X, Y",
+ "CSF1PO": "13",
+ "D5S818": "13, 14",
+ "D7S820": "8, 19",
+ "D13S317": "11, 12",
+ "FGA": "24",
+ "TH01": "8",
+ "TPOX": "11",
+ "vWA": "15",
+ "outputFormat": "xlsx"
+ }]
+
+ r = _clastr_batch_query(batch_data, 'Tanabe', False, 70)
+
+ with open('testing.xlsx', 'wb') as fd:
+ for chunk in r.iter_content(chunk_size=128):
+ fd.write(chunk)
+
+
# JSON data structure:
# {
# "description": "",
diff --git a/strprofiler/shiny_app/shiny_app.py b/strprofiler/shiny_app/shiny_app.py
index 17fcbfd..b977253 100644
--- a/strprofiler/shiny_app/shiny_app.py
+++ b/strprofiler/shiny_app/shiny_app.py
@@ -6,7 +6,7 @@
import strprofiler.utils as sp
from strprofiler.shiny_app.calc_functions import _single_query, _batch_query, _file_query
-from strprofiler.shiny_app.clastr_api import clastr_query
+from strprofiler.shiny_app.clastr_api import _clastr_query, _clastr_batch_query
from datetime import date
import time
@@ -272,6 +272,12 @@ def create_app(db=None):
multiple=False,
width="100%",
),
+ ui.input_select(
+ "search_type_batch",
+ "Search Type",
+ ["STRprofiler Database", "Cellosaurus Database (CLASTR)"],
+ width="100%"
+ ),
ui.input_action_button(
"csv_query",
"CSV Query",
@@ -639,7 +645,7 @@ def loaded_example_text():
input.query_filter_threshold(),
)
elif input.search_type() == 'Cellosaurus Database (CLASTR)':
- results = clastr_query(
+ results = _clastr_query(
query,
input.query_filter(),
input.score_amel_query(),
@@ -693,6 +699,7 @@ def out_result():
else:
out_df = pd.DataFrame({"No input provided.": []})
return out_df
+ # TO DO: Remove results table when changing query methods.
# Dealing with downloading results, when requested.
# Note that output_results() is a reactive Calc result.
@@ -718,26 +725,31 @@ def download():
@render.data_frame
def out_batch_df():
output_df.set(batch_query_results())
- try:
- return render.DataTable(output_df())
- except Exception:
- m = ui.modal(
- ui.div(
- {"style": "font-size: 18px"},
- ui.HTML(
- (
- "There was a fatal error in the query.
"
- "Ensure marker names match expectation, and that"
- " no special characters (spaces, etc.) were used in sample names."
- )
- ),
- ),
- title="Batch Query Error",
- easy_close=True,
- footer=None,
- )
- ui.modal_show(m)
- return render.DataTable(pd.DataFrame({"Failed Query. Fix Input File": []}))
+ print(output_df)
+ with reactive.isolate():
+ if input.search_type_batch() == 'STRprofiler Database':
+ try:
+ return render.DataTable(output_df())
+ except Exception:
+ m = ui.modal(
+ ui.div(
+ {"style": "font-size: 18px"},
+ ui.HTML(
+ (
+ "There was a fatal error in the query.
"
+ "Ensure marker names match expectation, and that"
+ " no special characters (spaces, etc.) were used in sample names."
+ )
+ ),
+ ),
+ title="Batch Query Error",
+ easy_close=True,
+ footer=None,
+ )
+ ui.modal_show(m)
+ return render.DataTable(pd.DataFrame({"Failed Query. Fix Input File": []}))
+ elif input.search_type_batch() == 'Cellosaurus Database (CLASTR)':
+ return render.DataTable(pd.DataFrame({"CLASTR Batch Query": ['Download Results']}))
# File input loading
@reactive.calc
@@ -776,39 +788,75 @@ def batch_query_results():
return pd.DataFrame({"Failed Query. Fix Input File": []})
if res_click_file() == 0:
- ui.insert_ui(
- ui.div(
- {"id": "inserted-downloader2"},
- ui.download_button(
- "download2", "Download CSV", width="25%", class_="btn-primary"
+ if input.search_type_batch() == 'STRprofiler Database':
+ ui.insert_ui(
+ ui.div(
+ {"id": "inserted-downloader2"},
+ ui.download_button(
+ "download2", "Download CSV", width="25%", class_="btn-primary"
+ ),
),
- ),
- selector="#res_card_batch",
- where="beforeEnd",
- )
- res_click_file.set(1)
- return _batch_query(
- query_df,
- str_database(),
- input.score_amel_batch(),
- input.mix_threshold_batch(),
- input.tan_threshold_batch(),
- input.mas_q_threshold_batch(),
- input.mas_r_threshold_batch(),
- )
+ selector="#res_card_batch",
+ where="beforeEnd",
+ )
+ res_click_file.set(1)
+ elif input.search_type_batch() == 'Cellosaurus Database (CLASTR)':
+ ui.insert_ui(
+ ui.div(
+ {"id": "inserted-downloader2"},
+ ui.download_button(
+ "download2", "Download XLSX", width="25%", class_="btn-primary"
+ ),
+ ),
+ selector="#res_card_batch",
+ where="beforeEnd",
+ )
+ res_click_file.set(1)
+
+ with reactive.isolate():
+ if input.search_type_batch() == 'STRprofiler Database':
+ results = _batch_query(
+ query_df,
+ str_database(),
+ input.score_amel_batch(),
+ input.mix_threshold_batch(),
+ input.tan_threshold_batch(),
+ input.mas_q_threshold_batch(),
+ input.mas_r_threshold_batch(),
+ )
+ elif input.search_type_batch() == 'Cellosaurus Database (CLASTR)':
+ clastr_query = [(lambda d: d.update(description=key) or d)(val) for (key, val) in query_df.items()]
+ results = _clastr_batch_query(
+ clastr_query,
+ input.query_filter(),
+ input.score_amel_batch(),
+ input.query_filter_threshold()
+ )
+ # TO DO: Change to a batch filter option set.
+ return results
+
+ # File input loading
+ @reactive.effect
+ @reactive.event(input.search_type_batch)
+ def _():
+ ui.remove_ui("#inserted-downloader2")
+ res_click_file.set(0)
+ # TO DO: Remove batch results table when changing methods.
# Dealing with dowloading results, when requested.
# Note that batch_query_results() is a reactive Calc result.
@render.download(
- filename="STR_Batch_Results_"
- + date.today().isoformat()
- + "_"
- + time.strftime("%Hh-%Mm", time.localtime())
- + ".csv"
+ filename=lambda: "STR_Batch_Results_" + date.today().isoformat() + "_" + time.strftime("%Hh-%Mm", time.localtime()) + ".csv"
+ if f"{input.search_type_batch()}" == 'STRprofiler Database'
+ else "STR_Batch_Results_" + date.today().isoformat() + "_" + time.strftime("%Hh-%Mm", time.localtime()) + ".xlsx"
)
def download2():
if batch_query_results() is not None:
- yield batch_query_results().to_csv(index=False)
+ if input.search_type_batch() == 'STRprofiler Database':
+ yield batch_query_results().to_csv(index=False)
+ if input.search_type_batch() == 'Cellosaurus Database (CLASTR)':
+ for chunk in batch_query_results().iter_content(chunk_size=128):
+ yield chunk
# Dealing with passing example file to user.
@render.download()
From 1bd91a99be750effb4dc609d3bd1d5df26ddeac6 Mon Sep 17 00:00:00 2001
From: Jared Andrews
Date: Tue, 14 May 2024 12:32:45 -0500
Subject: [PATCH 06/29] add requirements, bump version
---
.gitignore | 1 +
CHANGELOG.md | 7 +++++++
docs/requirements.txt | 4 +++-
pyproject.toml | 4 +++-
requirements.txt | 3 ++-
5 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/.gitignore b/.gitignore
index bad588f..719d172 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
+.conda/*
# PyInstaller
# Usually these files are written by a python script from a template
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0ea17a6..ef1f28f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
# Changelog
+## v0.3.0
+
+**Release date:**
+
+ - Added ability to query the CLASTR API for single or batch queries from within the STRprofiler
+ app - [#24](https://github.com/j-andrews7/strprofiler/pull/24).
+
## v0.2.0
**Release date: 04/16/2024**
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 367585e..c262090 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -4,4 +4,6 @@ myst-parser
rich-click
shiny
shinyswatch
-faicons
\ No newline at end of file
+faicons
+requests
+flatten-json
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index f83e082..834af42 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "strprofiler"
-version = "0.2.0"
+version = "0.3.0"
description = "A simple python utility to compare short tandem repeat (STR) profiles."
authors = ["Jared Andrews ",
"Mike Lloyd "]
@@ -18,6 +18,8 @@ shiny = "^0.8.0"
shinyswatch = "^0.4.2"
Jinja2 = "^3.1.3"
faicons = "^0.2.2"
+requests = "^2.31.0"
+flatten-json = "^0.1.14"
[tool.poetry.dev-dependencies]
diff --git a/requirements.txt b/requirements.txt
index 9ebff4b..daba595 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ requests==2.31.0
rich-click==1.7.3
shiny==0.8.1
shinyswatch==0.4.2
-Jinja2==3.1.2
\ No newline at end of file
+Jinja2==3.1.2
+requests==2.31.0
\ No newline at end of file
From 579a1cf82538d398a12a7ccd0f5c160e277773c1 Mon Sep 17 00:00:00 2001
From: MikeWLloyd
Date: Thu, 16 May 2024 09:16:23 -0400
Subject: [PATCH 07/29] fix for #26
---
.gitignore | 3 +-
strprofiler/shiny_app/clastr_api.py | 44 ++++++++++++++++++++++++++---
strprofiler/shiny_app/shiny_app.py | 1 -
3 files changed, 42 insertions(+), 6 deletions(-)
diff --git a/.gitignore b/.gitignore
index 719d172..8599a3a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -153,4 +153,5 @@ cython_debug/
#.idea/
.DS_Store
-strprofiler.json
\ No newline at end of file
+strprofiler.json
+testing.xlsx
diff --git a/strprofiler/shiny_app/clastr_api.py b/strprofiler/shiny_app/clastr_api.py
index 0082298..60ade3d 100644
--- a/strprofiler/shiny_app/clastr_api.py
+++ b/strprofiler/shiny_app/clastr_api.py
@@ -18,6 +18,16 @@ def _clastr_query(query, query_filter, include_amelogenin, score_filter):
elif query_filter == "Masters Reference":
query['algorithm'] = 3
+ if "PentaD" in query.keys():
+ query["Penta D"] = query.pop("PentaD")
+ elif "Penta_D" in query.keys():
+ query["Penta D"] = query.pop("Penta_D")
+
+ if "PentaE" in query.keys():
+ query["Penta E"] = query.pop("PentaE")
+ elif "Penta_E" in query.keys():
+ query["Penta E"] = query.pop("Penta_E")
+
query['includeAmelogenin'] = include_amelogenin
query['scoreFilter'] = score_filter
@@ -87,6 +97,11 @@ def _clastr_query(query, query_filter, include_amelogenin, score_filter):
merged['accession_link'] = "https://web.expasy.org/cellosaurus/" + merged['accession']
+ if "Penta D" in merged.keys():
+ merged["PentaD"] = merged.pop("Penta D")
+ if "Penta E" in merged.keys():
+ merged["PentaE"] = merged.pop("Penta E")
+
# add the query line to the top of merged, and reorder columns
query_added = pd.concat([query_df, merged]).reset_index(drop=True)
@@ -147,8 +162,8 @@ def _clastr_batch_query(query, query_filter, include_amelogenin, score_filter):
"D19S433": "14",
"D21S11": "31,31.2",
"FGA": "23",
- "Penta D": "",
- "Penta E": "",
+ "PentaD": "",
+ "PentaE": "",
"TH01": "7,9.3",
"TPOX": "8",
"vWA": "18",
@@ -159,12 +174,33 @@ def _clastr_batch_query(query, query_filter, include_amelogenin, score_filter):
# "Amelogenin": "X",
# "CSF1PO": "13,14",
# "D5S818": "13",
- # "D7S820": "8",
+ # "D7S820": "8,9",
# "D13S317": "12",
# "FGA": "24",
# "TH01": "8",
# "TPOX": "11",
- # "vWA": "16",
+ # "vWA": "16"
+ # }
+
+ # # stock example from https://www.cellosaurus.org/str-search/
+ # data = {"Amelogenin": "X",
+ # "CSF1PO": "11,12",
+ # "D2S1338": "19,23",
+ # "D3S1358": "15,17",
+ # "D5S818": "11,12",
+ # "D7S820": "10",
+ # "D8S1179": "10",
+ # "D13S317": "11,12",
+ # "D16S539": "11,12",
+ # "D18S51": "13",
+ # "D19S433": "14",
+ # "D21S11": "29,30",
+ # "FGA": "20,22",
+ # "PentaD": "11,14",
+ # "PentaE": "14,16",
+ # "TH01": "6,9",
+ # "TPOX": "8,9",
+ # "vWA": "17,19"
# }
r = _clastr_query(data, 'Tanabe', False, 70)
diff --git a/strprofiler/shiny_app/shiny_app.py b/strprofiler/shiny_app/shiny_app.py
index b977253..ff1c2c5 100644
--- a/strprofiler/shiny_app/shiny_app.py
+++ b/strprofiler/shiny_app/shiny_app.py
@@ -677,7 +677,6 @@ def out_result():
)
elif input.search_type() == 'Cellosaurus Database (CLASTR)':
out_df = output_df().copy()
- print(out_df)
if ('No Clastr Result' in out_df.columns) | ('Error' in out_df.columns):
return out_df
try:
From 77529be3f0844788a68b5eec75cec79557038fe2 Mon Sep 17 00:00:00 2001
From: Jared Andrews
Date: Thu, 16 May 2024 10:02:43 -0500
Subject: [PATCH 08/29] additional tweaks for #26
---
strprofiler/shiny_app/clastr_api.py | 21 +++++----------
strprofiler/utils.py | 41 ++++++++++++++++++++++-------
2 files changed, 37 insertions(+), 25 deletions(-)
diff --git a/strprofiler/shiny_app/clastr_api.py b/strprofiler/shiny_app/clastr_api.py
index 60ade3d..16f1c26 100644
--- a/strprofiler/shiny_app/clastr_api.py
+++ b/strprofiler/shiny_app/clastr_api.py
@@ -2,7 +2,7 @@
import json
import pandas as pd
from flatten_json import flatten
-
+from strprofiler.utils import _pentafix
def _clastr_query(query, query_filter, include_amelogenin, score_filter):
url = "https://www.cellosaurus.org/str-search/api/query/"
@@ -17,16 +17,8 @@ def _clastr_query(query, query_filter, include_amelogenin, score_filter):
query['algorithm'] = 2
elif query_filter == "Masters Reference":
query['algorithm'] = 3
-
- if "PentaD" in query.keys():
- query["Penta D"] = query.pop("PentaD")
- elif "Penta_D" in query.keys():
- query["Penta D"] = query.pop("Penta_D")
-
- if "PentaE" in query.keys():
- query["Penta E"] = query.pop("PentaE")
- elif "Penta_E" in query.keys():
- query["Penta E"] = query.pop("Penta_E")
+
+ query = _pentafix(query, reverse = True)
query['includeAmelogenin'] = include_amelogenin
query['scoreFilter'] = score_filter
@@ -97,10 +89,7 @@ def _clastr_query(query, query_filter, include_amelogenin, score_filter):
merged['accession_link'] = "https://web.expasy.org/cellosaurus/" + merged['accession']
- if "Penta D" in merged.keys():
- merged["PentaD"] = merged.pop("Penta D")
- if "Penta E" in merged.keys():
- merged["PentaE"] = merged.pop("Penta E")
+ merged = _pentafix(merged)
# add the query line to the top of merged, and reorder columns
@@ -123,6 +112,8 @@ def _clastr_query(query, query_filter, include_amelogenin, score_filter):
def _clastr_batch_query(query, query_filter, include_amelogenin, score_filter):
url = "https://www.cellosaurus.org/str-search/api/batch/"
+ query = [_pentafix(item, reverse = True) for item in query]
+
if query_filter == "Tanabe":
query = [dict(item, **{'algorithm': 1}) for item in query]
elif query_filter == "Masters Query":
diff --git a/strprofiler/utils.py b/strprofiler/utils.py
index 987128f..d3e533d 100644
--- a/strprofiler/utils.py
+++ b/strprofiler/utils.py
@@ -35,17 +35,38 @@ def _clean_element(x):
return ",".join(sorted_elements)
-def _pentafix(samps_dict):
+def _pentafix(samps_dict, reverse = False):
"""Takes a dictionary of alleles and returns a dictionary with common Penta markers renamed for consistency."""
- if "Penta D" in samps_dict.keys():
- samps_dict["PentaD"] = samps_dict.pop("Penta D")
- elif "Penta_D" in samps_dict.keys():
- samps_dict["PentaD"] = samps_dict.pop("Penta_D")
-
- if "Penta E" in samps_dict.keys():
- samps_dict["PentaE"] = samps_dict.pop("Penta E")
- elif "Penta_E" in samps_dict.keys():
- samps_dict["PentaE"] = samps_dict.pop("Penta_E")
+ if not reverse:
+ if "Penta C" in samps_dict.keys():
+ samps_dict["PentaC"] = samps_dict.pop("Penta C")
+ elif "Penta_C" in samps_dict.keys():
+ samps_dict["PentaC"] = samps_dict.pop("Penta_C")
+
+ if "Penta D" in samps_dict.keys():
+ samps_dict["PentaD"] = samps_dict.pop("Penta D")
+ elif "Penta_D" in samps_dict.keys():
+ samps_dict["PentaD"] = samps_dict.pop("Penta_D")
+
+ if "Penta E" in samps_dict.keys():
+ samps_dict["PentaE"] = samps_dict.pop("Penta E")
+ elif "Penta_E" in samps_dict.keys():
+ samps_dict["PentaE"] = samps_dict.pop("Penta_E")
+ else:
+ if "PentaC" in samps_dict.keys():
+ samps_dict["Penta C"] = samps_dict.pop("PentaC")
+ elif "Penta_C" in samps_dict.keys():
+ samps_dict["Penta C"] = samps_dict.pop("Penta_C")
+
+ if "PentaD" in samps_dict.keys():
+ samps_dict["Penta D"] = samps_dict.pop("PentaD")
+ elif "Penta_D" in samps_dict.keys():
+ samps_dict["Penta D"] = samps_dict.pop("Penta_D")
+
+ if "PentaE" in samps_dict.keys():
+ samps_dict["Penta E"] = samps_dict.pop("PentaE")
+ elif "Penta_E" in samps_dict.keys():
+ samps_dict["Penta E"] = samps_dict.pop("Penta_E")
return samps_dict
From 771c133d9ac628ff546a5071a9dc1b4293949d35 Mon Sep 17 00:00:00 2001
From: MikeWLloyd
Date: Thu, 16 May 2024 11:26:25 -0400
Subject: [PATCH 09/29] add marker check for single query
---
strprofiler/shiny_app/clastr_api.py | 58 +++++++++++++++++++++++++++--
strprofiler/shiny_app/shiny_app.py | 19 +++++++++-
2 files changed, 73 insertions(+), 4 deletions(-)
diff --git a/strprofiler/shiny_app/clastr_api.py b/strprofiler/shiny_app/clastr_api.py
index 16f1c26..e3c5157 100644
--- a/strprofiler/shiny_app/clastr_api.py
+++ b/strprofiler/shiny_app/clastr_api.py
@@ -4,6 +4,53 @@
from flatten_json import flatten
from strprofiler.utils import _pentafix
+
+def _valid_marker_check(markers):
+
+ valid_api_markers = ['Amelogenin',
+ 'CSF1PO',
+ 'D2S1338',
+ 'D3S1358',
+ 'D5S818',
+ 'D7S820',
+ 'D8S1179',
+ 'D13S317',
+ 'D16S539',
+ 'D18S51',
+ 'D19S433',
+ 'D21S11',
+ 'FGA',
+ 'Penta D',
+ 'Penta E',
+ 'PentaD',
+ 'PentaE',
+ 'TH01',
+ 'TPOX',
+ 'vWA',
+ 'D1S1656',
+ 'D2S441',
+ 'D6S1043',
+ 'D10S1248',
+ 'D12S391',
+ 'D22S1045',
+ 'DXS101',
+ 'DYS391',
+ 'F13A01',
+ 'F13B',
+ 'FESFPS',
+ 'LPL',
+ 'Penta C',
+ 'PentaC',
+ 'SE33']
+
+ # remove extra fields, if present as keys may come from _clastr_query or other.
+ query_markers = [marker for marker in markers if marker not in ['algorithm', 'includeAmelogenin', 'scoreFilter']]
+
+ missing_markers = list(set(query_markers) - set(valid_api_markers))
+
+ return missing_markers
+
+
def _clastr_query(query, query_filter, include_amelogenin, score_filter):
url = "https://www.cellosaurus.org/str-search/api/query/"
@@ -17,8 +64,8 @@ def _clastr_query(query, query_filter, include_amelogenin, score_filter):
query['algorithm'] = 2
elif query_filter == "Masters Reference":
query['algorithm'] = 3
-
- query = _pentafix(query, reverse = True)
+
+ query = _pentafix(query, reverse=True)
query['includeAmelogenin'] = include_amelogenin
query['scoreFilter'] = score_filter
@@ -112,7 +159,7 @@ def _clastr_query(query, query_filter, include_amelogenin, score_filter):
def _clastr_batch_query(query, query_filter, include_amelogenin, score_filter):
url = "https://www.cellosaurus.org/str-search/api/batch/"
- query = [_pentafix(item, reverse = True) for item in query]
+ query = [_pentafix(item, reverse=True) for item in query]
if query_filter == "Tanabe":
query = [dict(item, **{'algorithm': 1}) for item in query]
@@ -158,6 +205,7 @@ def _clastr_batch_query(query, query_filter, include_amelogenin, score_filter):
"TH01": "7,9.3",
"TPOX": "8",
"vWA": "18",
+ "NoGoodVeryBad": "I'm not a valid marker. However, that is ok. We catch this now."
}
# # stock from https://www.cellosaurus.org/str-search/help.html#5.1
@@ -194,6 +242,10 @@ def _clastr_batch_query(query, query_filter, include_amelogenin, score_filter):
# "vWA": "17,19"
# }
+ malformed_markers = _valid_marker_check(data.keys())
+
+ print(malformed_markers)
+
r = _clastr_query(data, 'Tanabe', False, 70)
print(r)
diff --git a/strprofiler/shiny_app/shiny_app.py b/strprofiler/shiny_app/shiny_app.py
index ff1c2c5..eae29d5 100644
--- a/strprofiler/shiny_app/shiny_app.py
+++ b/strprofiler/shiny_app/shiny_app.py
@@ -6,7 +6,7 @@
import strprofiler.utils as sp
from strprofiler.shiny_app.calc_functions import _single_query, _batch_query, _file_query
-from strprofiler.shiny_app.clastr_api import _clastr_query, _clastr_batch_query
+from strprofiler.shiny_app.clastr_api import _valid_marker_check, _clastr_query, _clastr_batch_query
from datetime import date
import time
@@ -645,6 +645,23 @@ def loaded_example_text():
input.query_filter_threshold(),
)
elif input.search_type() == 'Cellosaurus Database (CLASTR)':
+ malformed_markers = _valid_marker_check(query.keys())
+ if malformed_markers:
+ notify_m = ui.modal(
+ "Markers: {} are incompatible with the CLASTR query."
+ .format(str(malformed_markers)[1:-1]),
+ ui.tags.br(),
+ ui.tags.br(),
+ "These markers will not be used in the query.",
+ ui.tags.br(),
+ ui.tags.br(),
+ "See: ", ui.tags.a('CLASTR', href=str("https://www.cellosaurus.org/str-search/"), target="_blank"),
+ " for a complete list of compatible marker names",
+ title="Inompatible CLASTR Markers",
+ easy_close=True,
+ footer=ui.modal_button('Understood')
+ )
+ ui.modal_show(notify_m)
results = _clastr_query(
query,
input.query_filter(),
From 815036f51066e9e8095d48de3391a518eb68adc8 Mon Sep 17 00:00:00 2001
From: MikeWLloyd
Date: Thu, 16 May 2024 19:26:53 -0400
Subject: [PATCH 10/29] conditional batch options. modal notice for malformed
markers.
---
strprofiler/shiny_app/clastr_api.py | 4 +-
strprofiler/shiny_app/shiny_app.py | 126 +++++++++++++++++-----------
2 files changed, 80 insertions(+), 50 deletions(-)
diff --git a/strprofiler/shiny_app/clastr_api.py b/strprofiler/shiny_app/clastr_api.py
index e3c5157..fd0e9e9 100644
--- a/strprofiler/shiny_app/clastr_api.py
+++ b/strprofiler/shiny_app/clastr_api.py
@@ -44,7 +44,7 @@ def _valid_marker_check(markers):
'SE33']
# remove extra fields, if present as keys may come from _clastr_query or other.
- query_markers = [marker for marker in markers if marker not in ['algorithm', 'includeAmelogenin', 'scoreFilter']]
+ query_markers = [marker for marker in markers if marker not in ['algorithm', 'includeAmelogenin', 'scoreFilter', 'description']]
missing_markers = list(set(query_markers) - set(valid_api_markers))
@@ -143,8 +143,6 @@ def _clastr_query(query, query_filter, include_amelogenin, score_filter):
query_added = pd.concat([query_df, merged]).reset_index(drop=True)
query_added["bestScore"] = query_added['bestScore'].map("{0:.2f}".format).replace("nan", "")
- # print(query_added.columns)
-
if 'problem' in query_added.columns:
query_added = query_added[['accession', 'name', 'species', 'bestScore', 'accession_link', 'problem'] +
[c for c in query_added if c not in
diff --git a/strprofiler/shiny_app/shiny_app.py b/strprofiler/shiny_app/shiny_app.py
index eae29d5..10892d9 100644
--- a/strprofiler/shiny_app/shiny_app.py
+++ b/strprofiler/shiny_app/shiny_app.py
@@ -76,6 +76,25 @@ def _link_wrap(name, link, problem):
return ui.tags.a(name, href=str(link), target="_blank")
+def notify_modal(marker_list):
+ ui.modal_show(
+ ui.modal(
+ "Marker(s): {} are incompatible with the CLASTR query."
+ .format(str(marker_list)[1:-1]),
+ ui.tags.br(),
+ ui.tags.br(),
+ "The marker(s) will not be used in the query.",
+ ui.tags.br(),
+ ui.tags.br(),
+ "See: ", ui.tags.a('CLASTR', href=str("https://www.cellosaurus.org/str-search/"), target="_blank"),
+ " for a complete list of compatible marker names",
+            title="Incompatible CLASTR Markers",
+ easy_close=True,
+ footer=ui.modal_button('Understood')
+ )
+ )
+
+
# App Generation ###
def create_app(db=None):
@@ -236,33 +255,61 @@ def create_app(db=None):
{"id": "batch_sidebar"},
ui.tags.h3("Options"),
ui.tags.hr(),
+ ui.input_select(
+ "search_type_batch",
+ "Search Type",
+ ["STRprofiler Database", "Cellosaurus Database (CLASTR)"],
+ width="100%"
+ ),
ui.card(
ui.input_switch(
"score_amel_batch", "Score Amelogenin", value=False
),
- ui.input_numeric(
- "mix_threshold_batch",
- "'Mixed' Sample Threshold",
- value=3,
- width="100%",
- ),
- ui.input_numeric(
- "tan_threshold_batch",
- "Tanabe Filter Threshold",
- value=80,
- width="100%",
- ),
- ui.input_numeric(
- "mas_q_threshold_batch",
- "Masters (vs. query) Filter Threshold",
- value=80,
- width="100%",
+ ui.panel_conditional(
+ "input.search_type_batch === 'STRprofiler Database'",
+ ui.input_numeric(
+ "mix_threshold_batch",
+ "'Mixed' Sample Threshold",
+ value=3,
+ width="100%",
+ ),
+ ui.input_numeric(
+ "tan_threshold_batch",
+ "Tanabe Filter Threshold",
+ value=80,
+ width="100%",
+ ),
+ ui.input_numeric(
+ "mas_q_threshold_batch",
+ "Masters (vs. query) Filter Threshold",
+ value=80,
+ width="100%",
+ ),
+ ui.input_numeric(
+ "mas_r_threshold_batch",
+ "Masters (vs. reference) Filter Threshold",
+ value=80,
+ width="100%",
+ ),
),
- ui.input_numeric(
- "mas_r_threshold_batch",
- "Masters (vs. reference) Filter Threshold",
- value=80,
- width="100%",
+ ui.panel_conditional(
+ "input.search_type_batch === 'Cellosaurus Database (CLASTR)'",
+ ui.input_select(
+ "batch_query_filter",
+ "Similarity Score Filter",
+ choices=[
+ "Tanabe",
+ "Masters Query",
+ "Masters Reference",
+ ],
+ width="100%",
+ ),
+ ui.input_numeric(
+ "batch_query_filter_threshold",
+ "Similarity Score Filter Threshold",
+ value=80,
+ width="100%",
+ ),
),
),
ui.input_file(
@@ -272,12 +319,6 @@ def create_app(db=None):
multiple=False,
width="100%",
),
- ui.input_select(
- "search_type_batch",
- "Search Type",
- ["STRprofiler Database", "Cellosaurus Database (CLASTR)"],
- width="100%"
- ),
ui.input_action_button(
"csv_query",
"CSV Query",
@@ -645,23 +686,11 @@ def loaded_example_text():
input.query_filter_threshold(),
)
elif input.search_type() == 'Cellosaurus Database (CLASTR)':
+
malformed_markers = _valid_marker_check(query.keys())
if malformed_markers:
- notify_m = ui.modal(
- "Markers: {} are incompatible with the CLASTR query."
- .format(str(malformed_markers)[1:-1]),
- ui.tags.br(),
- ui.tags.br(),
- "These markers will not be used in the query.",
- ui.tags.br(),
- ui.tags.br(),
- "See: ", ui.tags.a('CLASTR', href=str("https://www.cellosaurus.org/str-search/"), target="_blank"),
- " for a complete list of compatible marker names",
- title="Inompatible CLASTR Markers",
- easy_close=True,
- footer=ui.modal_button('Understood')
- )
- ui.modal_show(notify_m)
+ notify_modal(malformed_markers)
+
results = _clastr_query(
query,
input.query_filter(),
@@ -741,7 +770,6 @@ def download():
@render.data_frame
def out_batch_df():
output_df.set(batch_query_results())
- print(output_df)
with reactive.isolate():
if input.search_type_batch() == 'STRprofiler Database':
try:
@@ -842,13 +870,17 @@ def batch_query_results():
)
elif input.search_type_batch() == 'Cellosaurus Database (CLASTR)':
clastr_query = [(lambda d: d.update(description=key) or d)(val) for (key, val) in query_df.items()]
+
+ malformed_markers = _valid_marker_check(query_df[next(iter(query_df))].keys())
+ if malformed_markers:
+ notify_modal(malformed_markers)
+
results = _clastr_batch_query(
clastr_query,
- input.query_filter(),
+ input.batch_query_filter(),
input.score_amel_batch(),
- input.query_filter_threshold()
+ input.batch_query_filter_threshold()
)
- # TO DO: Change to a batch filter option set.
return results
# File input loading
From 3d5d412c49ed17cd6dd24a5676a0550796f7754c Mon Sep 17 00:00:00 2001
From: MikeWLloyd
Date: Mon, 20 May 2024 15:27:02 -0400
Subject: [PATCH 11/29] global clastr function
---
pyproject.toml | 3 +-
strprofiler/clastr.py | 237 ++++++++++++++++++++++++++++
strprofiler/shiny_app/clastr_api.py | 50 +-----
strprofiler/shiny_app/shiny_app.py | 28 ++--
strprofiler/utils.py | 54 ++++++-
tests/Example_clastr_input.csv | 4 +
6 files changed, 312 insertions(+), 64 deletions(-)
create mode 100644 strprofiler/clastr.py
create mode 100644 tests/Example_clastr_input.csv
diff --git a/pyproject.toml b/pyproject.toml
index 834af42..1c596cd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ pandas = "^2.2"
rich-click = "^1.5.2"
numpy = "^1.26.3"
openpyxl = "^3.0.10"
-shiny = "^0.8.0"
+shiny = "^0.9.0"
shinyswatch = "^0.4.2"
Jinja2 = "^3.1.3"
faicons = "^0.2.2"
@@ -25,6 +25,7 @@ flatten-json = "^0.1.14"
[tool.poetry.scripts]
strprofiler = 'strprofiler.strprofiler:strprofiler'
+clastr = 'strprofiler.clastr:clastr_batch_post_request'
strprofiler-app = 'strprofiler.strprofiler:local_shiny_app'
[build-system]
diff --git a/strprofiler/clastr.py b/strprofiler/clastr.py
new file mode 100644
index 0000000..6365d2f
--- /dev/null
+++ b/strprofiler/clastr.py
@@ -0,0 +1,237 @@
+import rich_click as click
+from pathlib import Path
+from datetime import datetime
+import sys
+import pandas as pd
+import requests
+import json
+import strprofiler.utils as utils
+
+
+@click.command()
+@click.option(
+ "-sa",
+ "--search_algorithm",
+ default=1,
+ help="""Search algorithm to use in the Clastr query.
+ 1 - Tanabe, 2 - Masters (vs. query); 3 - Masters (vs. reference)""",
+ show_default=True,
+ type=int,
+)
+@click.option(
+ "-sm",
+ "--scoring_mode",
+ default=1,
+ help="""Search mode to account for missing alleles in query or reference.
+ 1 - Non-empty markers, 2 - Query markers, 3 - Reference markers.""",
+ show_default=True,
+ type=int,
+)
+@click.option(
+ "-sf",
+ "--score_filter",
+ default=80,
+ help="Minimum score to report as potential matches in summary table.",
+ show_default=True,
+ type=int,
+)
+@click.option(
+ "-mr",
+ "--max_results",
+ default=200,
+ help="Filter defining the maximum number of results to be returned.",
+ show_default=True,
+ type=int,
+)
+@click.option(
+ "-mm",
+ "--min_markers",
+ default=8,
+ help="Filter defining the minimum number of markers for matches to be reported.",
+ show_default=True,
+ type=int,
+)
+@click.option(
+    "-smap",
+ "--sample_map",
+ help="""Path to sample map in csv format for renaming.
+ First column should be sample names as given in STR file(s),
+ second should be new names to assign. No header.""",
+ type=click.Path(),
+)
+@click.option(
+ "-scol",
+ "--sample_col",
+ help="Name of sample column in STR file(s).",
+ default="Sample",
+ show_default=True,
+ type=str,
+)
+@click.option(
+ "-mcol",
+ "--marker_col",
+ help="""Name of marker column in STR file(s).
+ Only used if format is 'wide'.""",
+ default="Marker",
+ show_default=True,
+ type=str,
+)
+@click.option(
+ "-pfix",
+ "--penta_fix",
+ help="""Whether to try to harmonize PentaE/D allele spelling.""",
+ default=True,
+ show_default=True,
+ type=bool,
+)
+@click.option(
+ "-amel",
+ "--score_amel",
+ help="""Use Amelogenin for similarity scoring.""",
+ default=False,
+ show_default=True,
+ type=bool,
+)
+@click.option(
+ "-o",
+ "--output_dir",
+ default="./STRprofiler",
+ help="Path to the output directory.",
+ show_default=True,
+ type=click.Path(),
+)
+@click.argument("input_files", required=True, type=click.Path(exists=True), nargs=-1)
+@click.version_option()
+def clastr_batch_post_request(
+ input_files,
+ sample_map=None,
+ output_dir="./STRprofiler",
+ search_algorithm=1,
+ scoring_mode=1,
+ score_filter=80,
+ max_results=200,
+ min_markers=8,
+ sample_col="Sample Name",
+ marker_col="Marker",
+ penta_fix=True,
+ score_amel=False,
+):
+    """CLASTR_Query compares STR profiles to the human Cellosaurus knowledge base using the CLASTR REST API.
+
+ :param input_files: List of input STR files in csv, xlsx, tsv, or txt format.
+ :type input_files: click.Path
+
+ :param sample_map: Path to sample map in csv format for renaming.
+ First column should be sample names as given in STR file(s),
+ second should be new names to assign. No header., defaults to None
+ :type sample_map: str, optional
+
+ :param output_dir: Path to output directory, defaults to "./STRprofiler"
+ :type output_dir: str, optional
+
+ :param search_algorithm: Search algorithm to use in the Clastr query, Options: 1 - Tanabe, 2 - Masters (vs. query); 3 - Masters (vs. reference)
+ defaults to 1 (tanabe).
+ :type search_algorithm: int
+
+    :param scoring_mode: Search mode to account for missing alleles in query or reference.
+        Options: 1 - Non-empty markers, 2 - Query markers, 3 - Reference markers.
+        defaults to 1 (Non-empty markers).
+    :type scoring_mode: int
+
+ :param score_filter: Minimum score to report as potential matches in summary table, defaults to 80
+ :type score_filter: int, optional
+
+ :param max_results: Filter defining the maximum number of results to be returned.
+ Note that in the case of conflicted cell lines, the Best and Worst versions are processed as pairs and only the best
+ score is affected by the threshold. Consequently, some Worst cases with a score below the threshold can still be present in the results.
+ defaults to 200
+    :type max_results: int, optional
+
+ :param min_markers: Filter defining the minimum number of markers for matches to be reported, defaults to 8.
+    :type min_markers: int, optional
+
+ :param sample_col: Name of sample column in STR file(s), defaults to "Sample Name"
+ :type sample_col: str, optional
+
+ :param marker_col: Name of marker column in STR file(s).
+ Only used if format is 'wide', defaults to "Marker"
+ :type marker_col: str, optional
+
+ :param penta_fix: Whether to try to harmonize PentaE/D allele spelling, defaults to True
+ :type penta_fix: bool, optional
+
+ :param score_amel: Use Amelogenin for similarity scoring, defaults to False
+ :type score_amel: bool, optional
+ """
+
+ # Make output directory and open file for logging.
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
+ now = datetime.now()
+ dt_string = now.strftime("%Y%m%d.%H_%M_%S")
+ log_file = open(Path(output_dir, "strprofiler.clastrQuery." + dt_string + ".log"), "w")
+
+ print("Search algorithm: " + str(search_algorithm), file=log_file)
+ print("Scoring mode: " + str(scoring_mode), file=log_file)
+    print("Score filter: " + str(score_filter), file=log_file)
+ print("Max results: " + str(max_results), file=log_file)
+ print("Min markers: " + str(min_markers), file=log_file)
+ print("Sample map: " + str(sample_map), file=log_file)
+ print("Sample column: " + str(sample_col), file=log_file)
+ print("Marker column: " + str(marker_col), file=log_file)
+ print("Penta fix: " + str(penta_fix), file=log_file)
+ print("Use amelogenin for scoring: " + str(score_amel) + "\n", file=log_file)
+ print("Full command:", file=log_file)
+
+ print(" ".join(sys.argv) + "\n", file=log_file)
+
+ # Check for sample map.
+ if sample_map is not None:
+ sample_map = pd.read_csv(sample_map, header=None, encoding="unicode_escape")
+
+ # Data ingress.
+ query = utils.str_ingress(
+ paths=input_files,
+ sample_col=sample_col,
+ marker_col=marker_col,
+ sample_map=sample_map,
+ penta_fix=penta_fix,
+ ).to_dict(orient="index")
+
+ clastr_query = [(lambda d: d.update(description=key) or d)(val) for (key, val) in query.items()]
+
+ malformed_markers = utils.validate_api_markers(next(iter(clastr_query)).keys())
+ if malformed_markers:
+ print("Marker(s): {} are incompatible with the CLASTR query. The marker(s) will not be used in the query."
+ .format(str(malformed_markers)[1:-1]), file=log_file)
+ print("See: https://www.cellosaurus.org/str-search/ for a complete list of compatible marker names", file=log_file)
+
+ url = "https://www.cellosaurus.org/str-search/api/batch/"
+
+ clastr_query = [utils._pentafix(item, reverse=True) for item in clastr_query]
+ clastr_query = [dict(item, **{'algorithm': search_algorithm}) for item in clastr_query]
+ clastr_query = [dict(item, **{'scoringMode': scoring_mode}) for item in clastr_query]
+ clastr_query = [dict(item, **{'scoreFilter': score_filter}) for item in clastr_query]
+ clastr_query = [dict(item, **{'includeAmelogenin': score_amel}) for item in clastr_query]
+ clastr_query = [dict(item, **{'minMarkers': min_markers}) for item in clastr_query]
+ clastr_query = [dict(item, **{'maxResults': max_results}) for item in clastr_query]
+ clastr_query = [dict(item, **{'outputFormat': 'xlsx'}) for item in clastr_query]
+
+ print("Querying CLASTR API at: ", url, file=log_file)
+ r = requests.post(url, data=json.dumps(clastr_query))
+
+ try:
+ r.raise_for_status()
+ except requests.exceptions.HTTPError as e:
+ print("Request failed with error: '", e, "'", file=log_file)
+ print("Request failed with error: '", e, "'")
+ return ''
+
+ print("Response from query: ", r.status_code, file=log_file)
+
+ with open(Path(output_dir, "strprofiler.clastrQueryResult." + dt_string + ".xlsx"), "wb") as fd:
+ for chunk in r.iter_content(chunk_size=128):
+ fd.write(chunk)
+
+ print("Results saved: ", Path(output_dir, "strprofiler.clastrQueryResult." + dt_string + ".xlsx"), file=log_file)
+
+ log_file.close()
diff --git a/strprofiler/shiny_app/clastr_api.py b/strprofiler/shiny_app/clastr_api.py
index fd0e9e9..33047b2 100644
--- a/strprofiler/shiny_app/clastr_api.py
+++ b/strprofiler/shiny_app/clastr_api.py
@@ -2,53 +2,7 @@
import json
import pandas as pd
from flatten_json import flatten
-from strprofiler.utils import _pentafix
-
-
-def _valid_marker_check(markers):
-
- valid_api_markers = ['Amelogenin',
- 'CSF1PO',
- 'D2S1338',
- 'D3S1358',
- 'D5S818',
- 'D7S820',
- 'D8S1179',
- 'D13S317',
- 'D16S539',
- 'D18S51',
- 'D19S433',
- 'D21S11',
- 'FGA',
- 'Penta D',
- 'Penta E',
- 'PentaD',
- 'PentaE',
- 'TH01',
- 'TPOX',
- 'vWA',
- 'D1S1656',
- 'D2S441',
- 'D6S1043',
- 'D10S1248',
- 'D12S391',
- 'D22S1045',
- 'DXS101',
- 'DYS391',
- 'F13A01',
- 'F13B',
- 'FESFPS',
- 'LPL',
- 'Penta C',
- 'PentaC',
- 'SE33']
-
- # remove extra fields, if present as keys may come from _clastr_query or other.
- query_markers = [marker for marker in markers if marker not in ['algorithm', 'includeAmelogenin', 'scoreFilter', 'description']]
-
- missing_markers = list(set(query_markers) - set(valid_api_markers))
-
- return missing_markers
+from strprofiler.utils import _pentafix, validate_api_markers
def _clastr_query(query, query_filter, include_amelogenin, score_filter):
@@ -240,7 +194,7 @@ def _clastr_batch_query(query, query_filter, include_amelogenin, score_filter):
# "vWA": "17,19"
# }
- malformed_markers = _valid_marker_check(data.keys())
+ malformed_markers = validate_api_markers(data.keys())
print(malformed_markers)
diff --git a/strprofiler/shiny_app/shiny_app.py b/strprofiler/shiny_app/shiny_app.py
index 10892d9..3d5c145 100644
--- a/strprofiler/shiny_app/shiny_app.py
+++ b/strprofiler/shiny_app/shiny_app.py
@@ -4,9 +4,9 @@
import pandas as pd
from faicons import icon_svg
-import strprofiler.utils as sp
+import strprofiler.utils as utils
from strprofiler.shiny_app.calc_functions import _single_query, _batch_query, _file_query
-from strprofiler.shiny_app.clastr_api import _valid_marker_check, _clastr_query, _clastr_batch_query
+from strprofiler.shiny_app.clastr_api import _clastr_query, _clastr_batch_query
from datetime import date
import time
@@ -27,7 +27,7 @@ def database_load(file):
Exception: If the file fails to load or if sample ID names are duplicated.
"""
try:
- str_database = sp.str_ingress(
+ str_database = utils.str_ingress(
[file], # expects list
sample_col="Sample",
marker_col="Marker",
@@ -211,7 +211,7 @@ def create_app(db=None):
width="90%"
),
ui.tooltip(
- ui.input_action_button(
+ ui.input_task_button(
"search",
"Search",
class_="btn-success",
@@ -238,6 +238,7 @@ def create_app(db=None):
ui.column(3, ui.tags.h3("Results")),
ui.column(1, ui.p("")),
),
+ # TO DO: Try loading/thinking spinners.
ui.column(
12,
{"id": "res_card"},
@@ -319,7 +320,7 @@ def create_app(db=None):
multiple=False,
width="100%",
),
- ui.input_action_button(
+ ui.input_task_button(
"csv_query",
"CSV Query",
class_="btn-primary",
@@ -673,7 +674,7 @@ def loaded_example_text():
where="afterEnd",
)
res_click.set(1)
- thinking = ui.notification_show("Message: API Query Running.", duration=None)
+
# isolate input.search_type to prevent trigger when options change.
with reactive.isolate():
if input.search_type() == 'STRprofiler Database':
@@ -687,7 +688,7 @@ def loaded_example_text():
)
elif input.search_type() == 'Cellosaurus Database (CLASTR)':
- malformed_markers = _valid_marker_check(query.keys())
+ malformed_markers = utils.validate_api_markers(query.keys())
if malformed_markers:
notify_modal(malformed_markers)
@@ -697,7 +698,8 @@ def loaded_example_text():
input.score_amel_query(),
input.query_filter_threshold()
)
- ui.notification_remove(thinking)
+ # TO DO: Does this need to be async?
+
return results
@output
@@ -805,7 +807,7 @@ def batch_query_results():
ui.remove_ui("#inserted-downloader2")
return pd.DataFrame({"": []})
try:
- query_df = sp.str_ingress(
+ query_df = utils.str_ingress(
[file[0]["datapath"]],
sample_col="Sample",
marker_col="Marker",
@@ -850,6 +852,7 @@ def batch_query_results():
{"id": "inserted-downloader2"},
ui.download_button(
"download2", "Download XLSX", width="25%", class_="btn-primary"
+ # TO DO: Adjust spacing on 'results' section. XLSX button is too far down.
),
),
selector="#res_card_batch",
@@ -870,8 +873,7 @@ def batch_query_results():
)
elif input.search_type_batch() == 'Cellosaurus Database (CLASTR)':
clastr_query = [(lambda d: d.update(description=key) or d)(val) for (key, val) in query_df.items()]
-
- malformed_markers = _valid_marker_check(query_df[next(iter(query_df))].keys())
+ malformed_markers = utils.validate_api_markers(query_df[next(iter(query_df))].keys())
if malformed_markers:
notify_modal(malformed_markers)
@@ -881,6 +883,8 @@ def batch_query_results():
input.score_amel_batch(),
input.batch_query_filter_threshold()
)
+ # TO DO: Does this need to be async?
+
return results
# File input loading
@@ -935,7 +939,7 @@ def file_query_results():
if file is None:
ui.remove_ui("#inserted-downloader3")
return pd.DataFrame({"": []})
- query_df = sp.str_ingress(
+ query_df = utils.str_ingress(
[file[0]["datapath"]],
sample_col="Sample",
marker_col="Marker",
diff --git a/strprofiler/utils.py b/strprofiler/utils.py
index d3e533d..b137a50 100644
--- a/strprofiler/utils.py
+++ b/strprofiler/utils.py
@@ -35,14 +35,14 @@ def _clean_element(x):
return ",".join(sorted_elements)
-def _pentafix(samps_dict, reverse = False):
+def _pentafix(samps_dict, reverse=False):
"""Takes a dictionary of alleles and returns a dictionary with common Penta markers renamed for consistency."""
if not reverse:
if "Penta C" in samps_dict.keys():
samps_dict["PentaC"] = samps_dict.pop("Penta C")
elif "Penta_C" in samps_dict.keys():
samps_dict["PentaC"] = samps_dict.pop("Penta_C")
-
+
if "Penta D" in samps_dict.keys():
samps_dict["PentaD"] = samps_dict.pop("Penta D")
elif "Penta_D" in samps_dict.keys():
@@ -140,7 +140,8 @@ def _make_html(dataframe: pd.DataFrame):
{table_html}
-
+