Skip to content

Commit 813a61a

Browse files
Merge pull request #63 from databio/dev
release 0.3.0
2 parents 4298e4b + 4301c3d commit 813a61a

File tree

24 files changed

+774
-309
lines changed

24 files changed

+774
-309
lines changed

.github/workflows/run-pytest.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ jobs:
1212
runs-on: ${{ matrix.os }}
1313
strategy:
1414
matrix:
15-
python-version: ["3.8", "3.11"]
15+
python-version: ["3.9", "3.11"]
1616
os: [ubuntu-latest]
1717

1818
steps:

.gitignore

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,4 +139,15 @@ openSignalMatrix
139139
out2023/*
140140

141141
# test data
142-
test/test_data/*
142+
test/test_data/*
143+
/scripts/bedclassifier_tuning/results/
144+
/scripts/bedclassifier_tuning/data/
145+
genome_config.yaml
146+
alias/hg19/fasta/default/hg19.chrom.sizes
147+
alias/hg19/fasta/default/hg19.fa
148+
alias/hg19/fasta/default/hg19.fa.fai
149+
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c__ASDs.json
150+
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.chrom.sizes
151+
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa
152+
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai
153+
test/Untitled.ipynb

MANIFEST.in

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ include bedboss/bedqc/*
88
include bedboss/qdrant_index/*
99
include bedboss/bedbuncher/*
1010
include bedboss/bedbuncher/tools/*
11-
include bedboss/bedclassifier/*
11+
include bedboss/bedclassifier/*
12+
include bedboss/tokens/*

bedboss/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.1"
1+
__version__ = "0.3.0"

bedboss/bedboss.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from pephubclient.helpers import is_registry_path, MessageHandler as m
1313
from bbconf.bbagent import BedBaseAgent
1414
from bbconf.models.base_models import FileModel
15+
from bbconf.const import DEFAULT_LICENSE
1516

1617
from bedboss.bedstat.bedstat import bedstat
1718
from bedboss.bedmaker.bedmaker import make_all
@@ -55,6 +56,7 @@ def run_all(
5556
genome: str,
5657
bedbase_config: Union[str, bbconf.BedBaseAgent],
5758
name: str = None,
59+
license_id: str = DEFAULT_LICENSE,
5860
rfg_config: str = None,
5961
narrowpeak: bool = False,
6062
check_qc: bool = True,
@@ -67,6 +69,10 @@ def run_all(
6769
upload_qdrant: bool = False,
6870
upload_s3: bool = False,
6971
upload_pephub: bool = False,
72+
# Universes
73+
universe: bool = False,
74+
universe_method: str = None,
75+
universe_bedset: str = None,
7076
pm: pypiper.PipelineManager = None,
7177
) -> str:
7278
"""
@@ -78,6 +84,7 @@ def run_all(
7884
:param str genome: genome_assembly of the sample. [required] options: (hg19, hg38, mm10) # TODO: add more
7985
:param str name: name of the sample (human-readable name, e.g. "H3K27ac in liver") [optional]
8086
:param Union[str, bbconf.BedBaseConf] bedbase_config: The path to the bedbase configuration file, or bbconf object.
87+
:param str license_id: license identifier [optional] (default: "DUO:0000042"). All available licenses are listed at bedbase.org
8188
:param str rfg_config: file path to the genome config file [optional]
8289
:param bool narrowpeak: whether the regions are narrow. Used to create bed file from bedgraph or bigwig
8390
(transcription factor implies narrow, histone mark implies broad peaks) [optional]
@@ -92,6 +99,10 @@ def run_all(
9299
:param bool upload_qdrant: whether to upload (index) the bed file to qdrant
93100
:param bool upload_s3: whether to upload to s3
94101
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
102+
103+
:param bool universe: whether to add the sample as the universe [Default: False]
104+
:param str universe_method: method used to create the universe [Default: None]
105+
:param str universe_bedset: bedset identifier for the universe [Default: None]
95106
:param pypiper.PipelineManager pm: pypiper object
96107
:return str bed_digest: bed digest
97108
"""
@@ -189,6 +200,7 @@ def run_all(
189200
plots=plots.model_dump(exclude_unset=True),
190201
files=files.model_dump(exclude_unset=True),
191202
classification=classification.model_dump(exclude_unset=True),
203+
license_id=license_id,
192204
upload_qdrant=upload_qdrant,
193205
upload_pephub=upload_pephub,
194206
upload_s3=upload_s3,
@@ -197,6 +209,13 @@ def run_all(
197209
nofail=True,
198210
)
199211

212+
if universe:
213+
bbagent.bed.add_universe(
214+
bedfile_id=bed_metadata.bed_digest,
215+
bedset_id=universe_bedset,
216+
construct_method=universe_method,
217+
)
218+
200219
if stop_pipeline:
201220
pm.stop_pipeline()
202221

@@ -211,7 +230,9 @@ def insert_pep(
211230
bedset_id: str = None,
212231
bedset_name: str = None,
213232
rfg_config: str = None,
214-
create_bedset: bool = True,
233+
license_id: str = DEFAULT_LICENSE,
234+
create_bedset: bool = False,
235+
bedset_heavy: bool = False,
215236
check_qc: bool = True,
216237
ensdb: str = None,
217238
just_db_commit: bool = False,
@@ -232,7 +253,10 @@ def insert_pep(
232253
:param str bedset_id: bedset identifier
233254
:param str bedset_name: bedset name
234255
:param str rfg_config: path to the genome config file (refgenie)
256+
:param str license_id: license identifier [optional] (default: "DUO:0000042"). All available licenses are listed at bedbase.org.
257+
This license will be used for bedfiles where license is not provided in PEP file
235258
:param bool create_bedset: whether to create bedset
259+
:param bool bedset_heavy: whether to use heavy processing (add all columns to the database)
236260
:param bool upload_qdrant: whether to upload bedfiles to qdrant
237261
:param bool check_qc: whether to run quality control during bedmaking
238262
:param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata
@@ -279,6 +303,7 @@ def insert_pep(
279303
genome=pep_sample.genome,
280304
name=pep_sample.sample_name,
281305
bedbase_config=bbagent,
306+
license_id=pep_sample.get("license_id") or license_id,
282307
narrowpeak=is_narrow_peak,
283308
chrom_sizes=pep_sample.get("chrom_sizes"),
284309
open_signal_matrix=pep_sample.get("open_signal_matrix"),
@@ -292,8 +317,12 @@ def insert_pep(
292317
upload_qdrant=upload_qdrant,
293318
upload_s3=upload_s3,
294319
upload_pephub=upload_pephub,
320+
universe=pep_sample.get("universe"),
321+
universe_method=pep_sample.get("universe_method"),
322+
universe_bedset=pep_sample.get("universe_bedset"),
295323
pm=pm,
296324
)
325+
297326
processed_ids.append(bed_id)
298327
except BedBossException as e:
299328
_LOGGER.error(f"Failed to process {pep_sample.sample_name}. See {e}")
@@ -308,7 +337,7 @@ def insert_pep(
308337
name=bedset_name or pep.name,
309338
output_folder=output_folder,
310339
description=pep.description,
311-
heavy=True,
340+
heavy=bedset_heavy,
312341
upload_pephub=upload_pephub,
313342
upload_s3=upload_s3,
314343
no_fail=no_fail,

bedboss/bedbuncher/bedbuncher.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def run_bedbuncher(
158158
description=description,
159159
upload_pephub=upload_pephub,
160160
upload_s3=upload_s3,
161-
plots=plots.model_dump(exclude_none=True, exclude_unset=True),
161+
plots=plots.model_dump(exclude_none=True, exclude_unset=True) if plots else {},
162162
local_path=output_folder,
163163
no_fail=no_fail,
164164
overwrite=force_overwrite,

bedboss/bedclassifier/bedclassifier.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,39 @@ def get_bed_type(bed: str, no_fail: Optional[bool] = True) -> Tuple[str, str]:
3838

3939
max_rows = 5
4040
row_count = 0
41+
4142
while row_count <= max_rows:
4243
try:
4344
df = pd.read_csv(bed, sep="\t", header=None, nrows=4, skiprows=row_count)
4445
if row_count > 0:
4546
_LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
4647
break
48+
except UnicodeDecodeError as e:
49+
try:
50+
df = pd.read_csv(
51+
bed,
52+
sep="\t",
53+
header=None,
54+
nrows=4,
55+
skiprows=row_count,
56+
encoding="utf-16",
57+
)
58+
if row_count > 0:
59+
_LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
60+
break
61+
except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
62+
if row_count <= max_rows:
63+
row_count += 1
64+
else:
65+
if no_fail:
66+
_LOGGER.warning(
67+
f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype"
68+
)
69+
return "unknown_bedtype", "unknown_bedtype"
70+
else:
71+
raise BedTypeException(
72+
reason=f"Bed type could not be determined due to CSV parse error {e}"
73+
)
4774
except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
4875
if row_count <= max_rows:
4976
row_count += 1

bedboss/bedmaker/bedmaker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def make_bed(
169169
if input_type not in [member.value for member in InputTypes]:
170170
raise BedBossException(
171171
f"Invalid input type: {input_type}. "
172-
f"Supported types: {', '.join(InputTypes.__members__.values())}"
172+
f"Supported types: {', '.join([k.value for k in InputTypes])}"
173173
)
174174

175175
if not pm:
@@ -405,7 +405,7 @@ def make_all(
405405

406406
return BedMakerOutput(
407407
bed_file=output_bed,
408-
bigbed_file=output_bigbed,
408+
bigbed_file=os.path.abspath(output_bigbed) if output_bigbed else None,
409409
bed_digest=RegionSet(output_bed).identifier,
410410
bed_type=bed_type,
411411
bed_format=bed_format,

bedboss/bedmaker/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from bedboss.const import (
1717
REFGENIE_ENV_VAR,
18+
DEFAULT_REFGENIE_PATH,
1819
)
1920

2021
_LOGGER = logging.getLogger("bedboss")
@@ -66,7 +67,7 @@ def get_rgc(rfg_config: Union[str, Path] = None) -> RGC:
6667
"""
6768
if not rfg_config:
6869
_LOGGER.info("Creating refgenie genome config file...")
69-
cwd = os.getenv(REFGENIE_ENV_VAR, os.getcwd())
70+
cwd = os.getenv(REFGENIE_ENV_VAR, DEFAULT_REFGENIE_PATH)
7071
rfg_config = os.path.join(cwd, "genome_config.yaml")
7172

7273
# get path to the genome config; from arg or env var if arg not provided

bedboss/bedstat/bedstat.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
OPEN_SIGNAL_URL,
1818
)
1919
from bedboss.utils import download_file
20-
from bedboss.exceptions import OpenSignalMatrixException
20+
from bedboss.exceptions import OpenSignalMatrixException, BedBossException
2121

2222

2323
_LOGGER = logging.getLogger("bedboss")
@@ -158,7 +158,11 @@ def bedstat(
158158
f"--ensdb={ensdb} --digest={bed_digest}"
159159
)
160160

161-
pm.run(cmd=command, target=json_file_path)
161+
try:
162+
pm.run(cmd=command, target=json_file_path)
163+
except Exception as e:
164+
_LOGGER.error(f"Pipeline failed: {e}")
165+
raise BedBossException(f"Pipeline failed: {e}")
162166

163167
data = {}
164168
if os.path.exists(json_file_path):

0 commit comments

Comments
 (0)