Skip to content

Commit f8ec717

Browse files
committed
Make keywords functional and allow integer output from frequency()
1 parent 7e0580e commit f8ec717

File tree

6 files changed

+66
-29
lines changed

6 files changed

+66
-29
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@ This returns a dictionary with bases as keys and their frequency as values. Note
103103

104104
The start and end position are as with the `sequence()` method described above.
105105

106+
If integer counts are preferred, then they can instead be returned.
107+
108+
>>> tb.frequency("chr1", 24, 74, False)
109+
{'A': 6, 'C': 6, 'T': 6, 'G': 6}
110+
106111
## Close a file
107112

108113
A `TwoBit` object can be closed with the `close()` method.

lib2bit/2bit.c

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -272,14 +272,20 @@ void increment(char base, uint32_t *A, uint32_t *C, uint32_t *T, uint32_t *G) {
272272
}
273273
}
274274

275-
double *twobitFrequencyWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end) {
276-
double *out = malloc(4 * sizeof(double));
275+
void *twobitFrequencyWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t end, int fraction) {
276+
void *out;
277277
uint32_t sz = end - start, pos = 0;
278278
uint32_t A = 0, C = 0, T = 0, G = 0, len = end - start;
279279
uint32_t maskIdx = -1, maskStart = -1, maskEnd = -1;
280280
uint32_t blockStart, offset;
281281
uint8_t byte, base;
282282
int rv = 0;
283+
284+
if(fraction) {
285+
out = malloc(4 * sizeof(double));
286+
} else {
287+
out = malloc(4 * sizeof(uint32_t));
288+
}
283289
if(!out) return NULL;
284290

285291
getMask(tb, tid, start, end, &maskIdx, &maskStart, &maskEnd);
@@ -303,10 +309,17 @@ double *twobitFrequencyWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t
303309
if(rv != 1) offset = 0;
304310
}
305311

306-
out[0] = ((double) A)/((double) len);
307-
out[1] = ((double) C)/((double) len);
308-
out[2] = ((double) T)/((double) len);
309-
out[3] = ((double) G)/((double) len);
312+
if(fraction) {
313+
((double*) out)[0] = ((double) A)/((double) len);
314+
((double*) out)[1] = ((double) C)/((double) len);
315+
((double*) out)[2] = ((double) T)/((double) len);
316+
((double*) out)[3] = ((double) G)/((double) len);
317+
} else {
318+
((uint32_t*) out)[0] = A;
319+
((uint32_t*) out)[1] = C;
320+
((uint32_t*) out)[2] = T;
321+
((uint32_t*) out)[3] = G;
322+
}
310323

311324
return out;
312325

@@ -315,7 +328,7 @@ double *twobitFrequencyWorker(TwoBit *tb, uint32_t tid, uint32_t start, uint32_t
315328
return NULL;
316329
}
317330

318-
double *twobitFrequency(TwoBit *tb, char *chrom, uint32_t start, uint32_t end) {
331+
void *twobitFrequency(TwoBit *tb, char *chrom, uint32_t start, uint32_t end, int fraction) {
319332
uint32_t tid = 0, i;
320333

321334
//Get the chromosome ID
@@ -337,7 +350,7 @@ double *twobitFrequency(TwoBit *tb, char *chrom, uint32_t start, uint32_t end) {
337350
if(end > tb->idx->size[tid]) return NULL;
338351
if(start >= end) return NULL;
339352

340-
return twobitFrequencyWorker(tb, tid, start, end);
353+
return twobitFrequencyWorker(tb, tid, start, end, fraction);
341354
}
342355

343356
/*

lib2bit/2bit.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ uint32_t twobitChromLen(TwoBit *tb, char *chrom);
4747
//Return the sequence of the given range (or a whole chromosome if start=end=0)
4848
char *twobitSequence(TwoBit *tb, char *chrom, uint32_t start, uint32_t end);
4949

50-
//We can try to assume that N is always a T and see if that's correct...
51-
double *twobitFrequency(TwoBit *tb, char *chrom, uint32_t start, uint32_t end);
50+
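//Return a pointer to 4 doubles (fractional != 0) or 4 uint32_ts (fractional == 0) holding the per-base frequencies or counts in A, C, T, G order. Returns NULL on error; the caller must free() the result.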
51+
void *twobitFrequency(TwoBit *tb, char *chrom, uint32_t start, uint32_t end, int fractional);
5252

5353
#ifdef __cplusplus
5454
}

py2bit.c

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,17 @@
22
#include <inttypes.h>
33
#include "py2bit.h"
44

5-
static PyObject *py2bitOpen(PyObject *self, PyObject *pyArgs) {
5+
static PyObject *py2bitOpen(PyObject *self, PyObject *args, PyObject *kwds) {
66
char *fname = NULL;
7-
PyObject *pystoreMasked = Py_False;
7+
PyObject *storeMaskedO = Py_False;
88
pyTwoBit_t *pytb;
99
int storeMasked = 0;
1010
TwoBit *tb = NULL;
11+
static char *kwd_list[] = {"fname", "storeMasked", NULL};
1112

12-
if(!PyArg_ParseTuple(pyArgs, "s|O", &fname, &pystoreMasked)) goto error;
13+
if(!PyArg_ParseTupleAndKeywords(args, kwds, "s|O", kwd_list, &fname, &storeMaskedO)) goto error;
1314

14-
if(pystoreMasked == Py_True) storeMasked = 1;
15+
if(storeMaskedO == Py_True) storeMasked = 1;
1516

1617
//Open the file
1718
tb = twobitOpen(fname, storeMasked);
@@ -151,14 +152,15 @@ PyObject *PyString_FromString(char *seq) {
151152
}
152153
#endif
153154

154-
static PyObject *py2bitSequence(pyTwoBit_t *self, PyObject *args) {
155+
static PyObject *py2bitSequence(pyTwoBit_t *self, PyObject *args, PyObject *kwds) {
155156
PyObject *ret = NULL;
156157
TwoBit *tb = self->tb;
157158
char *seq, *chrom;
158159
unsigned long startl = 0, endl = 0;
159160
uint32_t start, end, len;
161+
static char *kwd_list[] = {"chrom", "start", "end", NULL};
160162

161-
if(!PyArg_ParseTuple(args, "s|kk", &chrom, &startl, &endl)) {
163+
if(!PyArg_ParseTupleAndKeywords(args, kwds, "s|kk", kwd_list, &chrom, &startl, &endl)) {
162164
PyErr_SetString(PyExc_RuntimeError, "You must supply at least a chromosome!");
163165
return NULL;
164166
}
@@ -191,15 +193,18 @@ static PyObject *py2bitSequence(pyTwoBit_t *self, PyObject *args) {
191193
return ret;
192194
}
193195

194-
static PyObject *py2bitFrequency(pyTwoBit_t *self, PyObject *args) {
196+
static PyObject *py2bitFrequency(pyTwoBit_t *self, PyObject *args, PyObject *kwds) {
195197
PyObject *ret = NULL, *val = NULL;
198+
PyObject *fractionO = Py_True;
196199
TwoBit *tb = self->tb;
197200
char *chrom;
198-
double *o = NULL;
201+
void *o = NULL;
199202
unsigned long startl = 0, endl = 0;
200203
uint32_t start, end, len;
204+
static char *kwd_list[] = {"chrom", "start", "end", "fraction", NULL};
205+
int fraction = 1;
201206

202-
if(!PyArg_ParseTuple(args, "s|kk", &chrom, &startl, &endl)) {
207+
if(!PyArg_ParseTupleAndKeywords(args, kwds, "s|kkO", kwd_list, &chrom, &startl, &endl, &fractionO)) {
203208
PyErr_SetString(PyExc_RuntimeError, "You must supply at least a chromosome!");
204209
return NULL;
205210
}
@@ -217,7 +222,9 @@ static PyObject *py2bitFrequency(pyTwoBit_t *self, PyObject *args) {
217222
}
218223
start = (uint32_t) startl;
219224

220-
o = twobitFrequency(tb, chrom, start, end);
225+
if(fractionO == Py_False) fraction = 0;
226+
227+
o = twobitFrequency(tb, chrom, start, end, fraction);
221228
if(!o) {
222229
PyErr_SetString(PyExc_RuntimeError, "Received an error while determining the per-base frequency.");
223230
return NULL;
@@ -227,29 +234,35 @@ static PyObject *py2bitFrequency(pyTwoBit_t *self, PyObject *args) {
227234
if(!ret) goto error;
228235

229236
//A
230-
val = PyFloat_FromDouble(o[0]);
237+
if(fraction) val = PyFloat_FromDouble(((double*)o)[0]);
238+
else val = PyLong_FromUnsignedLong(((uint32_t*)o)[0]);
231239
if(!val) goto error;
232240
if(PyDict_SetItemString(ret, "A", val) == -1) goto error;
233241
Py_DECREF(val);
234242

235243
//C
236-
val = PyFloat_FromDouble(o[1]);
244+
if(fraction) val = PyFloat_FromDouble(((double*)o)[1]);
245+
else val = PyLong_FromUnsignedLong(((uint32_t*)o)[1]);
237246
if(!val) goto error;
238247
if(PyDict_SetItemString(ret, "C", val) == -1) goto error;
239248
Py_DECREF(val);
240249

241250
//T
242-
val = PyFloat_FromDouble(o[2]);
251+
if(fraction) val = PyFloat_FromDouble(((double*)o)[2]);
252+
else val = PyLong_FromUnsignedLong(((uint32_t*)o)[2]);
243253
if(!val) goto error;
244254
if(PyDict_SetItemString(ret, "T", val) == -1) goto error;
245255
Py_DECREF(val);
246256

247257
//G
248-
val = PyFloat_FromDouble(o[3]);
258+
if(fraction) val = PyFloat_FromDouble(((double*)o)[3]);
259+
else val = PyLong_FromUnsignedLong(((uint32_t*)o)[3]);
249260
if(!val) goto error;
250261
if(PyDict_SetItemString(ret, "G", val) == -1) goto error;
251262
Py_DECREF(val);
252263

264+
free(o);
265+
253266
return ret;
254267

255268
error:

py2bit.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ typedef struct {
77
int storeMasked; //Whether storeMasked was set. 0 = False, 1 = True
88
} pyTwoBit_t;
99

10-
static PyObject* py2bitOpen(PyObject *self, PyObject *args);
10+
static PyObject* py2bitOpen(PyObject *self, PyObject *args, PyObject *kwds);
1111
static PyObject *py2bitInfo(pyTwoBit_t *pybw, PyObject *args);
1212
static PyObject* py2bitClose(pyTwoBit_t *pybw, PyObject *args);
1313
static PyObject* py2bitChroms(pyTwoBit_t *pybw, PyObject *args);
14-
static PyObject *py2bitSequence(pyTwoBit_t *pybw, PyObject *args);
15-
static PyObject *py2bitFrequency(pyTwoBit_t *pybw, PyObject *args);
14+
static PyObject *py2bitSequence(pyTwoBit_t *pybw, PyObject *args, PyObject *kwds);
15+
static PyObject *py2bitFrequency(pyTwoBit_t *pybw, PyObject *args, PyObject *kwds);
1616
static void py2bitDealloc(pyTwoBit_t *pybw);
1717

1818
static PyMethodDef tbMethods[] = {
@@ -117,22 +117,27 @@ Positional arguments:\n\
117117
Keyword arguments:\n\
118118
start: Starting position (0-based)\n\
119119
end: Ending position (1-based)\n\
120+
fraction: Whether to return fractional or integer values (default 'True',\n\
121+
so fractional values are returned)\n\
120122
\n\
121123
Returns:\n\
122-
A dictionary with nucleotide as the key and fraction as the value.\n\
124+
A dictionary with nucleotide as the key and fraction (or count) as the\n\
125+
value.\n\
123126
\n\
124127
If start and end aren't specified, the entire chromosome is returned. If the\n\
125128
end value is beyond the end of the chromosome then it is adjusted accordingly.\n\
126129
\n\
127130
Note that the fractions will sum to much less than 1 if there are hard-masked\n\
128-
bases.\n\
131+
bases. Counts may sum to less than the length of the region for the same reason.\n\
129132
\n\
130133
>>> import py2bit\n\
131134
>>> tb = py2bit.open(\"test/test.2bit\")\n\
132135
>>> tb.frequency(\"chr1\")\n\
133136
{'A': 0.08, 'C': 0.08, 'T': 0.08666666666666667, 'G': 0.08666666666666667}\n\
134137
>>> tb.frequency(\"chr1\", 24, 74)\n\
135138
{'A': 0.12, 'C': 0.12, 'T': 0.12, 'G': 0.12}\n\
139+
>>> tb.frequency(\"chr1\", 24, 74, False)\n\
140+
{'A': 6, 'C': 6, 'T': 6, 'G': 6}\n\
136141
>>> tb.close()"},
137142
{NULL, NULL, 0, NULL}
138143
};

py2bitTest/test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,5 @@ def testFrequency(self):
3939
tb = py2bit.open(self.fname, True)
4040
assert(tb.frequency("chr1") == {'A': 0.08, 'C': 0.08, 'T': 0.08666666666666667, 'G': 0.08666666666666667})
4141
assert(tb.frequency("chr1", 24, 74) == {'A': 0.12, 'C': 0.12, 'T': 0.12, 'G': 0.12})
42+
assert(tb.frequency("chr1", 24, 74, False) == {'A': 6, 'C': 6, 'T': 6, 'G': 6})
4243
tb.close()

0 commit comments

Comments
 (0)