
Commit 5262774

Author: Shaurita Hutchins

Improve documentation (#11)

* Add website and documentation links to setup.py
* Add documentation to call_htseq function.
* Refactored function argument.
* Added docstring to csvtolist utils function.
* Added docstrings.

1 parent 6a36a1e commit 5262774

8 files changed, +141 -65 lines changed


HTSeqCountCluster/htseq_count_cluster.py

Lines changed: 39 additions & 8 deletions
@@ -13,13 +13,34 @@


 def call_htseq(infile, gtf, outfile):
-    """Call the htseq-count script."""
-    cmd = 'htseq-count -f bam -s no {} {} -o {}_htseq.out'.format(infile, gtf, outfile)
+    """Call the htseq-count script.
+
+    :param infile: An alignment file of aligned reads in SAM format.
+    :type infile: str
+    :param gtf: The gtf (Gene transfer format) file.
+    :type gtf: str
+    :param outfile: The name of the output SAM alignment file.
+    :type outfile: str
+    """
+    cmd = 'htseq-count -f bam -s no {} {} -o {}_htseq.out'.format(
+        infile, gtf, outfile)
     return cmd


 def htseq_jobber(input_path, inputlist, gtf, outpath, email):
-    """Create multiple pbs jobs based on input list of files."""
+    """Create multiple pbs jobs based on input list of files.
+
+    :param input_path: [description]
+    :type input_path: [type]
+    :param inputlist: [description]
+    :type inputlist: [type]
+    :param gtf: The gtf (Gene transfer format) file.
+    :type gtf: str
+    :param outpath: [description]
+    :type outpath: [type]
+    :param email: An email address to send notifications.
+    :type email: str
+    """
     jobids = []
     for item in inputlist:
         htseqjob = PBSJob(email_address=email, base_jobname=item)
@@ -33,7 +54,13 @@ def htseq_jobber(input_path, inputlist, gtf, outpath, email):


 def check_job_status(job_id, email=True):
-    """Use Qstat to monitor your job status."""
+    """Use Qstat to monitor your job status.
+
+    :param job_id: The job's id.
+    :type job_id: str
+    :param email: A flag to decide whether to send email, defaults to True
+    :type email: bool, optional
+    """
     # TODO Allow either slack notifications or email or text.
     qwatch = Qstat().watch(job_id)
     if qwatch == 'Job id not found.':
@@ -51,13 +78,17 @@ def main():
         description=textwrap.dedent('''\
             This is a command line wrapper around htseq-count.
             '''))
-    parser.add_argument('-p', '--inpath', help='Path of your samples/sample folders.', required=True)
-    parser.add_argument('-f', '--infile', help='Name or path to your input csv file.', required=True)
-    parser.add_argument('-g', '--gtf', help='Name or path to your gtf/gff file.', required=True)
+    parser.add_argument('-p', '--inpath', help='Path of your samples/sample folders.',
+                        required=True)
+    parser.add_argument('-f', '--infile', help='Name or path to your input csv file.',
+                        required=True)
+    parser.add_argument('-g', '--gtf', help='Name or path to your gtf/gff file.',
+                        required=True)
     parser.add_argument('-o', '--outpath',
                         help='Directory of your output counts file. The counts file will be named.',
                         required=True)
-    parser.add_argument('-e', '--email', help='Email address to send script completion to.')
+    parser.add_argument('-e', '--email',
+                        help='Email address to send script completion to.')

     args = parser.parse_args()

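For context, call_htseq only formats the htseq-count command string shown in the hunk above; a minimal usage sketch follows (the module path is taken from the file name above, and the file names are placeholders, not from the commit):

from HTSeqCountCluster.htseq_count_cluster import call_htseq

# Hypothetical inputs; call_htseq returns the command string, it does not run it.
cmd = call_htseq(infile='sample1.bam', gtf='annotation.gtf', outfile='sample1')
print(cmd)
# -> htseq-count -f bam -s no sample1.bam annotation.gtf -o sample1_htseq.out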
HTSeqCountCluster/mergecounts.py

Lines changed: 13 additions & 8 deletions
@@ -1,28 +1,33 @@
 # -*- coding: utf-8 -*-
-import pandas as pd
 import os
 import argparse
 import textwrap
+
+import pandas as pd
+
 from HTSeqCountCluster.logger import Logger

 # Create a merge-counts logger
 mc_log = Logger().default(logname="merge-counts", logfile=None)


-def merge_counts_tables(filesdirectory):
+def merge_counts_tables(files_dir):
     """Merge multiple counts tables into 1 counts table.

     After running htseq-count-cluster, there will be a counts table for each
     sample in the output directory. This function will use the genes column as
     the first column and then insert each subsequent sample name as column
     header with counts data as the column rows.
+
+    :param files_dir: The directory of the individual counts files.
+    :type files_dir: str
     """
     mc_log.info("Running merge-counts script.")
-    if filesdirectory is ".":
-        filesdirectory = os.getcwd()
+    if files_dir is ".":
+        files_dir = os.getcwd()

-    mc_log.info("Your directory location is: %s" % filesdirectory)
-    files = os.listdir(filesdirectory)
+    mc_log.info("Your directory location is: %s" % files_dir)
+    files = os.listdir(files_dir)

     samplenames = []
     sample_dfs = []
@@ -32,7 +37,7 @@ def merge_counts_tables(filesdirectory):
         if ext == 'out':
             samplename, barcode = filename.split('-')
             samplenames.append(samplename)
-            filep = os.path.join(filesdirectory, file)
+            filep = os.path.join(files_dir, file)
             data = pd.read_table(filep, header=None,
                                  names=['Genes', samplename])
             mc_log.info("A dataframe has been created for %s." % samplename)
@@ -67,7 +72,7 @@ def main():
                         type=str)
     args = parser.parse_args()

-    merge_counts_tables(filesdirectory=args.directory)
+    merge_counts_tables(files_dir=args.directory)


 if __name__ == '__main__':

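The hunk above only shows the per-sample read step; a rough sketch of the idea behind merge_counts_tables follows. The file names are hypothetical, and the final merge shown here is one plausible way to combine the frames on the shared Genes column, not necessarily the exact approach used in the rest of the file:

import pandas as pd

# Each htseq-count output is a two-column table: gene id, count.
sample_a = pd.read_table('sampleA-ACGT.out', header=None, names=['Genes', 'sampleA'])
sample_b = pd.read_table('sampleB-TGCA.out', header=None, names=['Genes', 'sampleB'])

# Combine the per-sample frames so each sample becomes one column.
merged = sample_a.merge(sample_b, on='Genes')
merged.to_csv('merged_counts.csv', index=False)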
HTSeqCountCluster/pbsjob/pbsconfig.py

Lines changed: 19 additions & 19 deletions
@@ -4,7 +4,7 @@
 import sys
 import os

-from HTSeqCountCluster.pbsjob.pbsutils import randomid
+from HTSeqCountCluster.pbsjob.pbsutils import random_id

 if sys.version_info.major < 3:
     raise NotImplementedError('This is not designed for the python version in your \
@@ -15,23 +15,23 @@

 _format1 = '%a %b %d %I:%M:%S %p %Y'

-_jobname = 'htseq_{}'.format(randomid(length=4))
+_jobname = 'htseq_{}'.format(random_id(length=4))

 __DEFAULT__ = {
-    'author': getpass.getuser(),
-    'description': 'This is a default pbs job.',
-    'date': d.now().strftime(_format1),
-    'proj_name': 'htseq-cluster',
-    'select': '1',
-    'memgb': '2gb',
-    'cput': '24:00:00',
-    'wt': '12:00:00',
-    'job_name': _jobname,
-    'outfile': _jobname + '.o',
-    'errfile': _jobname + '.e',
-    'script': _jobname,
-    'log_name': _jobname,
-    'pbsworkdir': os.getcwd(),
-    'cmd': 'python3 ' + os.path.join(os.getcwd(), _jobname + '.py'),
-    'email': 'n/a'
-}
+    'author': getpass.getuser(),
+    'description': 'This is a default pbs job.',
+    'date': d.now().strftime(_format1),
+    'proj_name': 'htseq-cluster',
+    'select': '1',
+    'memgb': '2gb',
+    'cput': '24:00:00',
+    'wt': '12:00:00',
+    'job_name': _jobname,
+    'outfile': _jobname + '.o',
+    'errfile': _jobname + '.e',
+    'script': _jobname,
+    'log_name': _jobname,
+    'pbsworkdir': os.getcwd(),
+    'cmd': 'python3 ' + os.path.join(os.getcwd(), _jobname + '.py'),
+    'email': 'n/a'
+}

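Since __DEFAULT__ holds the per-job attributes, here is a short sketch of overriding a few of them for a specific job; only the key names come from the dict above, the override values are placeholders:

from HTSeqCountCluster.pbsjob.pbsconfig import __DEFAULT__

# Copy the defaults and override selected attributes for one job.
job_attrs = dict(__DEFAULT__)
job_attrs.update({
    'job_name': 'htseq_demo',      # hypothetical job name
    'email': 'user@example.com',   # hypothetical address
    'wt': '04:00:00',              # walltime
    'memgb': '4gb',
})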
HTSeqCountCluster/pbsjob/pbsjob.py

Lines changed: 14 additions & 8 deletions
@@ -3,15 +3,16 @@
 from pkg_resources import resource_filename

 from HTSeqCountCluster.logger import Logger
-from HTSeqCountCluster.pbsjob.pbsutils import (basejobids, writecodefile,
-                                               import_temp, file2str)
+from HTSeqCountCluster.pbsjob.pbsutils import (basejobids, write_code_file,
+                                               import_temp, file_to_str)
 from HTSeqCountCluster.pbsjob.pbsconfig import __DEFAULT__
 from HTSeqCountCluster import pbsjob
 from HTSeqCountCluster.pbsjob.qstat import Qstat


 class BasePBSJob(object):
     """Base class for simple jobs."""
+
     def __init__(self, base_jobname):
         """Initialize job attributes."""
         self.default_job_attributes = __DEFAULT__
@@ -43,6 +44,7 @@ def _cleanup(self, jobname):

 class PBSJob(BasePBSJob):
     """Create a qsub/pbs job & script for the job to execute."""
+
     def __init__(self, email_address, base_jobname=None):
         super().__init__(base_jobname=base_jobname)
         self.email = email_address
@@ -82,10 +84,13 @@ def submit_code(self, code, cleanup=True, default=True):
         code_str = code

         if default:
-            self.sgejob_log.info('You are running a job with default attributes.')
-            writecodefile(filename=self.jobname, code=code_str, language='python')
+            self.sgejob_log.info(
+                'You are running a job with default attributes.')
+            writecodefile(filename=self.jobname,
+                          code=code_str, language='python')
             pyfilename = self.jobname + '.py'
-            self.sgejob_log.info('%s python file has been created.' % pyfilename)
+            self.sgejob_log.info(
+                '%s python file has been created.' % pyfilename)

         # Create the pbs script from the template or dict
         pbstemp = import_temp(self.temp_pbs)
@@ -104,7 +109,8 @@ def submit_code(self, code, cleanup=True, default=True):
         try:
             cmd = ['qsub ' + self.jobname + '.pbs']  # this is the command
             # Shell MUST be True
-            cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, shell=True, check=True)
+            cmd_status = run(cmd, stdout=PIPE, stderr=PIPE,
+                             shell=True, check=True)
         except CalledProcessError as err:
             self.sgejob_log.error(err.stderr.decode('utf-8'))
             if cleanup:
@@ -144,7 +150,8 @@ def submit_cmd(self, cmd, cleanup=True):
         try:
             cmd = ['qsub ' + self.jobname + '.pbs']  # this is the command
             # Shell MUST be True
-            cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, shell=True, check=True)
+            cmd_status = run(cmd, stdout=PIPE, stderr=PIPE,
+                             shell=True, check=True)
         except CalledProcessError as err:
             self.sgejob_log.error(err.stderr.decode('utf-8'))
             if cleanup:
@@ -162,4 +169,3 @@ def submit_cmd(self, cmd, cleanup=True):

         else:  # Unsuccessful. Stdout will be '1'
             self.sgejob_log.error('PBS job not submitted.')
-

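A minimal sketch of submitting a job with PBSJob, assuming a cluster where qsub is on the PATH; the email address, job name, and command are placeholders, and the import path is inferred from the file layout above:

from HTSeqCountCluster.pbsjob.pbsjob import PBSJob

# PBSJob writes a .pbs script and submits it with qsub (see submit_cmd above).
job = PBSJob(email_address='user@example.com', base_jobname='sample1')
job.submit_cmd('htseq-count -f bam -s no sample1.bam annotation.gtf -o sample1_htseq.out')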
HTSeqCountCluster/pbsjob/pbsutils.py

Lines changed: 22 additions & 8 deletions
@@ -8,16 +8,26 @@


 def basejobids(length, name='submit'):
-    """"Create base job id and name."""
-    base_id = randomid(length=length)
+    """"Create base job id and name.
+
+    :param length: [description]
+    :type length: [type]
+    :param name: [description], defaults to 'submit'
+    :type name: str, optional
+    :return: [description]
+    :rtype: [type]
+    """
+    base_id = random_id(length=length)
     base = name + "_{0}".format(base_id)

     return base_id, base


 def import_temp(filepath):
-    """Import the script or file that you need a template of and that has
-    temp strings.
+    """Import a template file that has template strings.
+
+    :param filepath: [description]
+    :type filepath: [type]
     """
     file_temp = open(filepath, 'r')
     file_str = file_temp.read()
@@ -27,19 +37,23 @@ def import_temp(filepath):
     return file_temp


-def file2str(filepath):
-    """Turn the contents of a file (python file) into a string."""
+def file_to_str(filepath):
+    """Turn the contents of a file (python file) into a string.
+
+    :param filepath: [description]
+    :type filepath: [type]
+    """
     file_temp = open(filepath, 'r')
     file_str = file_temp.read()
     return file_str


-def randomid(length=5):
+def random_id(length=5):
     """Generate a random ID of 5 characters to append to qsub job name."""
     return ''.join(random.sample(string.ascii_letters + string.digits, length))


-def writecodefile(filename, code, language):
+def write_code_file(filename, code, language):
     """Create a python file and write the code to it."""
     if language == 'python':
         with open(filename + '.py', 'w') as pyfile:

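For reference, a quick sketch of the renamed helpers above; the name 'htseq' and the printed values are illustrative only:

from HTSeqCountCluster.pbsjob.pbsutils import basejobids, random_id

print(random_id(length=5))             # e.g. 'aB3kZ'
base_id, base = basejobids(length=4, name='htseq')
print(base_id, base)                   # e.g. 'x9Qf', 'htseq_x9Qf'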
HTSeqCountCluster/pbsjob/qstat.py

Lines changed: 19 additions & 8 deletions
@@ -3,43 +3,54 @@
 import getpass
 import re

+from HTSeqCountCluster.logger import Logger
+

 class Qstat(object):
     def __init__(self):
         """Initialize class."""
         _username = getpass.getuser()
         self.username = _username
         self.split_regex = re.compile(r'\s+')
+        self.qstat_log = Logger().default(logname="qstat", logfile=None)

     def qstatinfo(self, qstat_path='qstat'):
-        """Retrieve qstat output."""
+        """Retrieve qstat output.
+
+        :param qstat_path: [description], defaults to 'qstat'
+        :type qstat_path: str, optional
+        """
         try:
             qstatinfo = check_output([qstat_path])
         except CalledProcessError as cpe:
             return_code = 'qstat returncode: %s' % cpe.returncode
             std_error = 'qstat standard output: %s' % cpe.stderr
-            print(return_code + '\n' + std_error)
+            self.qstat_log(return_code + '\n' + std_error)
         except FileNotFoundError:
             raise FileNotFoundError('qstat is not on your machine.')
+        else:
+            jobs = self._output_parser(qstatinfo)

-        jobs = self._output_parser(qstatinfo)
-
-        return jobs
+            return jobs

     def _output_parser(self, output):
         """Parse output from qstat pbs commandline program.

         Returns a list of dictionaries for each job.
+
+        :param output: The qstat output.
+        :type output: [type]
         """
         lines = output.decode('utf-8').split('\n')
         del lines[:5]
         jobs = []
         for line in lines:
             els = self.split_regex.split(line)
             try:
-                j = {"job_id": els[0], "name": els[1], "user": els[2], "elapsed_time": els[3],
-                     "status": els[4], "queue": els[5]}
-                jobs.append(j)
+                j = {"job_id": els[0], "name": els[1], "user": els[2],
+                     "elapsed_time": els[3], "status": els[4],
+                     "queue": els[5]}
+                jobs.append(j)

             except IndexError:
                 pass

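A minimal sketch of how the parsed qstat output can be consumed, assuming qstat is available on the machine; the dictionary keys come from _output_parser above:

from HTSeqCountCluster.pbsjob.qstat import Qstat

# qstatinfo() shells out to qstat and returns a list of job dicts.
for job in Qstat().qstatinfo():
    print(job['job_id'], job['name'], job['status'])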
HTSeqCountCluster/utils/__init__.py

Lines changed: 11 additions & 2 deletions
@@ -2,8 +2,17 @@
 import pandas as pd


-def csvtolist(csvfile):
+def csvtolist(csvfile, column=0):
+    """Convert a column of a csv file to a list.
+
+    :param csvfile: A comma delimited file.
+    :type csvfile: str
+    :param column: The number of the column to convert.
+    :type column: int
+    :return: A list
+    :rtype: list
+    """
     df = pd.read_csv(csvfile, header=None)
-    output_list = sorted(list(df[0]))
+    output_list = sorted(list(df[column]))

     return output_list

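A short usage sketch of the updated csvtolist; 'samples.csv' is a hypothetical headerless, comma-delimited file, and column=0 (the default) selects its first column:

from HTSeqCountCluster.utils import csvtolist

samples = csvtolist('samples.csv', column=0)
print(samples)   # sorted list of the values in column 0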