Skip to content

Commit 15be3e3

Browse files
Merge pull request #76 from databio/dev
Release 0.4.1
2 parents 5a21300 + 6df0600 commit 15be3e3

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

53 files changed

+44090
-85
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,3 +151,8 @@ data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e27
151151
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa
152152
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai
153153
test/Untitled.ipynb
154+
/scripts/ref_genome_validating/genome_folder/
155+
/scripts/ref_genome_validating/data/
156+
/scripts/ref_genome_validating/results/
157+
/scripts/ref_genome_validating/stats_results/results.yaml
158+
/scripts/ref_genome_validating/stats_results/results_backup_11sep2024.yaml

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
repos:
22
# Using this mirror lets us use mypyc-compiled black, which is about 2x faster
33
- repo: https://github.com/psf/black-pre-commit-mirror
4-
rev: 24.1.1
4+
rev: 24.8.0
55
hooks:
66
- id: black
77
# It is recommended to specify the latest version of Python

MANIFEST.in

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,8 @@ include bedboss/qdrant_index/*
99
include bedboss/bedbuncher/*
1010
include bedboss/bedbuncher/tools/*
1111
include bedboss/bedclassifier/*
12+
include bedboss/refgenome_validator/*
1213
include bedboss/tokens/*
13-
include bedboss/bbuploader/*
14+
include bedboss/tokens/*
15+
include bedboss/bbuploader/*
16+
include bedboss/refgenome_validator/chrom_sizes/*

bedboss/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.0"
1+
__version__ = "0.5.0"

bedboss/bbuploader/cli.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ def upload_all(
4141
True, help="Run skipped projects. [Default: False]"
4242
),
4343
run_failed: bool = typer.Option(True, help="Run failed projects. [Default: False]"),
44+
standardize_pep: bool = typer.Option(
45+
False, help="Standardize pep with BEDMESS. [Default: False]"
46+
),
4447
):
4548
from .main import upload_all as upload_all_function
4649

@@ -57,6 +60,7 @@ def upload_all(
5760
rerun=rerun,
5861
run_skipped=run_skipped,
5962
run_failed=run_failed,
63+
standardize_pep=standardize_pep,
6064
)
6165

6266

@@ -79,6 +83,9 @@ def upload_gse(
7983
True, help="Run skipped projects. [Default: False]"
8084
),
8185
run_failed: bool = typer.Option(True, help="Run failed projects. [Default: False]"),
86+
standardize_pep: bool = typer.Option(
87+
False, help="Standardize pep with BEDMESS. [Default: False]"
88+
),
8289
):
8390
from .main import upload_gse as upload_gse_function
8491

@@ -91,6 +98,7 @@ def upload_gse(
9198
rerun=rerun,
9299
run_skipped=run_skipped,
93100
run_failed=run_failed,
101+
standardize_pep=standardize_pep,
94102
)
95103

96104

bedboss/bbuploader/main.py

Lines changed: 28 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from bbconf import BedBaseAgent
77
from bbconf.db_utils import GeoGseStatus, GeoGsmStatus
88
from pephubclient import PEPHubClient
9+
from pephubclient.helpers import MessageHandler
910
from pephubclient.models import SearchReturnModel
1011
from sqlalchemy import and_, select
1112
from sqlalchemy.orm import Session
@@ -19,7 +20,7 @@
1920
from bedboss.bedboss import run_all
2021
from bedboss.bedbuncher.bedbuncher import run_bedbuncher
2122
from bedboss.exceptions import BedBossException
22-
from bedboss.utils import standardize_genome_name
23+
from bedboss.utils import standardize_genome_name, standardize_pep as pep_standardizer
2324

2425
_LOGGER = logging.getLogger(PKG_NAME)
2526
_LOGGER.setLevel(logging.DEBUG)
@@ -36,8 +37,9 @@ def upload_all(
3637
genome: str = None,
3738
create_bedset: bool = True,
3839
rerun: bool = False,
39-
run_skipped=False,
40-
run_failed=True,
40+
run_skipped: bool = False,
41+
run_failed: bool = True,
42+
standardize_pep: bool = False,
4143
):
4244
"""
4345
This is the main function responsible for processing bed files from PEPHub.
@@ -54,6 +56,7 @@ def upload_all(
5456
:param rerun: rerun processing of the series
5557
:param run_skipped: rerun files that were skipped
5658
:param run_failed: rerun failed files
59+
:param standardize_pep: standardize pep metadata using BEDMS
5760
"""
5861

5962
phc = PEPHubClient()
@@ -76,10 +79,14 @@ def upload_all(
7679
_LOGGER.info(f"found {pep_annotation_list.count} projects")
7780

7881
count = 0
82+
total_projects = len(pep_annotation_list.results)
7983
for gse_pep in pep_annotation_list.results:
80-
84+
count += 1
8185
with Session(bbagent.config.db_engine.engine) as session:
82-
_LOGGER.info(f"Processing: '{gse_pep.name}'")
86+
MessageHandler.print_success(f"{'##' * 30}")
87+
MessageHandler.print_success(
88+
f"#### Processing: '{gse_pep.name}'. #### Processing {count} / {total_projects}. ####"
89+
)
8390

8491
gse_status = session.scalar(
8592
select(GeoGseStatus).where(GeoGseStatus.gse == gse_pep.name)
@@ -122,6 +129,8 @@ def upload_all(
122129
genome=genome,
123130
sa_session=session,
124131
gse_status_sa_model=gse_status,
132+
standardize_pep=standardize_pep,
133+
rerun=rerun,
125134
)
126135
except Exception as err:
127136
_LOGGER.error(
@@ -134,7 +143,6 @@ def upload_all(
134143
status_parser(gse_status, upload_result)
135144
session.commit()
136145

137-
count += 1
138146
if count >= download_limit:
139147
break
140148

@@ -244,8 +252,9 @@ def upload_gse(
244252
create_bedset: bool = True,
245253
genome: str = None,
246254
rerun: bool = False,
247-
run_skipped=False,
248-
run_failed=True,
255+
run_skipped: bool = False,
256+
run_failed: bool = True,
257+
standardize_pep: bool = False,
249258
):
250259
"""
251260
Upload bed files from GEO series to BedBase
@@ -258,6 +267,7 @@ def upload_gse(
258267
:param rerun: rerun processing of the series
259268
:param run_skipped: rerun files that were skipped
260269
:param run_failed: rerun failed files
270+
:param standardize_pep: standardize pep metadata using BEDMS
261271
262272
:return: None
263273
"""
@@ -302,6 +312,8 @@ def upload_gse(
302312
genome=genome,
303313
sa_session=session,
304314
gse_status_sa_model=gse_status,
315+
standardize_pep=standardize_pep,
316+
rerun=rerun,
305317
)
306318
except Exception as e:
307319
_LOGGER.error(f"Processing of '{gse}' failed with error: {e}")
@@ -347,6 +359,8 @@ def _upload_gse(
347359
genome: str = None,
348360
sa_session: Session = None,
349361
gse_status_sa_model: GeoGseStatus = None,
362+
standardize_pep: bool = False,
363+
rerun: bool = False,
350364
) -> ProjectProcessingStatus:
351365
"""
352366
Upload bed files from GEO series to BedBase
@@ -358,6 +372,8 @@ def _upload_gse(
358372
:param genome: reference genome to upload to database. If None, all genomes will be processed
359373
:param sa_session: opened session to the database
360374
:param gse_status_sa_model: sqlalchemy model for project status
375+
:param standardize_pep: standardize pep metadata using BEDMS
376+
:param rerun: force overwrite data in the database
361377
362378
:return: None
363379
"""
@@ -371,12 +387,14 @@ def _upload_gse(
371387

372388
project = phc.load_project(f"bedbase/{gse}:{DEFAULT_GEO_TAG}")
373389

390+
if standardize_pep:
391+
project = pep_standardizer(project)
392+
374393
project_status = ProjectProcessingStatus(number_of_samples=len(project.samples))
375394
uploaded_files = []
376395
gse_status_sa_model.number_of_files = len(project.samples)
377396
sa_session.commit()
378397
for project_sample in project.samples:
379-
380398
sample_gsm = project_sample.get("sample_geo_accession", "").lower()
381399

382400
required_metadata = process_pep_sample(
@@ -435,7 +453,7 @@ def _upload_gse(
435453
upload_pephub=True,
436454
upload_s3=True,
437455
upload_qdrant=True,
438-
force_overwrite=False,
456+
force_overwrite=rerun,
439457
)
440458
uploaded_files.append(file_digest)
441459
sample_status.status = STATUS.SUCCESS
@@ -449,7 +467,6 @@ def _upload_gse(
449467
sa_session.commit()
450468

451469
if create_bedset and uploaded_files:
452-
453470
_LOGGER.info(f"Creating bedset for: '{gse}'")
454471
run_bedbuncher(
455472
bedbase_config=bedbase_config,
@@ -470,41 +487,3 @@ def _upload_gse(
470487

471488
_LOGGER.info(f"Processing of '{gse}' is finished with success!")
472489
return project_status
473-
474-
475-
#
476-
# if __name__ == "__main__":
477-
# # upload_gse(
478-
# # # gse="gse246900",
479-
# # # gse="gse247593",
480-
# # # gse="gse241222",
481-
# # #gse="gse266130",
482-
# # gse="gse99178",
483-
# # # gse="gse240325", # TODO: check if qc works
484-
# # # gse="gse229592", # mice
485-
# # bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
486-
# # outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
487-
# # # genome="HG38",
488-
# # # rerun=True,
489-
# # run_failed=True,
490-
# # run_skipped=True,
491-
# # )
492-
# upload_all(
493-
# bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
494-
# outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
495-
# start_date="2024/01/21",
496-
# end_date="2024/08/28",
497-
# search_limit=2,
498-
# search_offset=0,
499-
# genome="GRCh38",
500-
# rerun=True,
501-
# )
502-
# # upload_all(
503-
# # bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
504-
# # outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
505-
# # start_date="2024/01/01",
506-
# # # end_date="2024/03/28",
507-
# # search_limit=200,
508-
# # search_offset=0,
509-
# # genome="GRCh38",
510-
# # )

bedboss/bbuploader/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55

66
class BedBossMetadata(BaseModel):
77
genome: str = Field(None, alias="ref_genome")
8-
organism: Optional[str] = Field("", alias="sample_organism_ch1")
8+
species_name: Optional[str] = Field("", alias="sample_organism_ch1")
99
species_id: Optional[str] = Field("", alias="sample_taxid_ch1")
1010
cell_type: Optional[str] = ""
1111
cell_line: Optional[str] = ""
1212
genotype: Optional[str] = ""
13-
exp_protocol: Optional[str] = Field("", alias="sample_library_strategy")
13+
assay: Optional[str] = Field("", alias="sample_library_strategy")
1414
library_source: Optional[str] = Field("", alias="sample_library_source")
1515
target: Optional[str] = Field("")
1616
antibody: Optional[str] = Field("", alias="chip_antibody")

bedboss/bedboss.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,26 @@
1414
from pephubclient.helpers import MessageHandler as m
1515
from pephubclient.helpers import is_registry_path
1616

17-
from bedboss._version import __version__
1817
from bedboss.bedbuncher import run_bedbuncher
1918
from bedboss.bedmaker.bedmaker import make_all
2019
from bedboss.bedstat.bedstat import bedstat
2120
from bedboss.const import BEDBOSS_PEP_SCHEMA_PATH, PKG_NAME
22-
from bedboss.exceptions import BedBossException
2321
from bedboss.models import (
2422
BedClassificationUpload,
2523
FilesUpload,
2624
PlotsUpload,
2725
StatsUpload,
2826
)
29-
from bedboss.utils import get_genome_digest, standardize_genome_name
27+
from bedboss.refgenome_validator.main import ReferenceValidator
28+
29+
from bedboss.utils import (
30+
standardize_genome_name,
31+
get_genome_digest,
32+
standardize_pep as pep_standardizer,
33+
)
34+
from bedboss.exceptions import BedBossException
35+
from bedboss._version import __version__
36+
3037

3138
_LOGGER = logging.getLogger(PKG_NAME)
3239

@@ -54,6 +61,7 @@ def run_all(
5461
rfg_config: str = None,
5562
narrowpeak: bool = False,
5663
check_qc: bool = True,
64+
validate_reference: bool = True,
5765
chrom_sizes: str = None,
5866
open_signal_matrix: str = None,
5967
ensdb: str = None,
@@ -83,6 +91,7 @@ def run_all(
8391
:param bool narrowpeak: whether the regions are narrow. Used to create bed file from bedgraph or bigwig
8492
(transcription factor implies narrow, histone mark implies broad peaks) [optional]
8593
:param bool check_qc: set True to run quality control during bedmaking [optional] (default: True)
94+
:param bool validate_reference: set True to run the reference genome validator [optional] (default: True)
8695
:param str chrom_sizes: a full path to the chrom.sizes required for the bedtobigbed conversion [optional]
8796
:param str open_signal_matrix: a full path to the openSignalMatrix required for the tissue [optional]
8897
:param dict other_metadata: a dict containing all attributes from the sample
@@ -137,7 +146,7 @@ def run_all(
137146
pm=pm,
138147
)
139148
if not other_metadata:
140-
other_metadata = {}
149+
other_metadata = {"sample_name": name}
141150

142151
statistics_dict = bedstat(
143152
bedfile=bed_metadata.bed_file,
@@ -187,13 +196,22 @@ def run_all(
187196
bed_format=bed_metadata.bed_format.value,
188197
)
189198

199+
if validate_reference:
200+
_LOGGER.info("Validating reference genome")
201+
ref_valid_stats = ReferenceValidator().determine_compatibility(
202+
bedfile=bed_metadata.bed_file, concise=True
203+
)
204+
else:
205+
ref_valid_stats = None
206+
190207
bbagent.bed.add(
191208
identifier=bed_metadata.bed_digest,
192209
stats=stats.model_dump(exclude_unset=True),
193210
metadata=other_metadata,
194211
plots=plots.model_dump(exclude_unset=True),
195212
files=files.model_dump(exclude_unset=True),
196213
classification=classification.model_dump(exclude_unset=True),
214+
ref_validation=ref_valid_stats,
197215
license_id=license_id,
198216
upload_qdrant=upload_qdrant,
199217
upload_pephub=upload_pephub,
@@ -235,6 +253,7 @@ def insert_pep(
235253
upload_pephub: bool = False,
236254
upload_qdrant: bool = False,
237255
no_fail: bool = False,
256+
standardize_pep: bool = False,
238257
pm: pypiper.PipelineManager = None,
239258
) -> None:
240259
"""
@@ -260,6 +279,7 @@ def insert_pep(
260279
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
261280
:param bool upload_qdrant: whether to execute qdrant indexing
262281
:param bool no_fail: whether to raise an error if bedset was not added to the database
282+
:param bool standardize_pep: whether to standardize the pep file before processing by using bedms. (default: False)
263283
:param pypiper.PipelineManager pm: pypiper object
264284
:return: None
265285
"""
@@ -276,6 +296,9 @@ def insert_pep(
276296
else:
277297
raise BedBossException("Incorrect pep type. Exiting...")
278298

299+
if standardize_pep:
300+
pep = pep_standardizer(pep)
301+
279302
bbagent = BedBaseAgent(bedbase_config)
280303

281304
validate_project(pep, BEDBOSS_PEP_SCHEMA_PATH)

bedboss/bedbuncher/tools/bedsetStat.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ if (is.null(opt$json)) {
6767
#'
6868
#' Calculates how many regionsets (bedfiles) overlap at least said percentage
6969
#' of regions included in the universe. The universe is considered a union of
70-
#' all regionsets (bedfiles) in the colection of
70+
#' all regionsets (bedfiles) in the collection of
7171
#' regionsets (bedset, or set of bedfiles)
7272
#'
7373
#' @param queryList GRangesList object with regionsets to be considered

bedboss/bedclassifier/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-
from bedboss.bedclassifier.bedclassifier import get_bed_type

0 commit comments

Comments
 (0)