Skip to content

Commit 373e60c

Browse files
committed
Copy over annotated 2bit.h
1 parent 523b124 commit 373e60c

File tree

1 file changed

+91
-14
lines changed

1 file changed

+91
-14
lines changed

lib2bit/2bit.h

Lines changed: 91 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,58 @@
11
#include <inttypes.h>
22
#include <stdio.h>
33

4+
/*! \mainpage lib2bit
5+
*
6+
* \section Introduction
7+
*
8+
* lib2bit is a C-based library for accessing [2bit files](https://genome.ucsc.edu/FAQ/FAQformat.html#format7). At the moment, only reading 2bit files is supported (there are no plans to change this, though if someone wants to submit a pull request...). Though it's unlikely to matter,
9+
*
10+
* The motivation for this project is the need for fast access to 2bit files in [deepTools](https://github.com/fidelram/deepTools). Originally, we were using bx-python for this, which had the benefit of being easy to install and pretty quick. However, that wasn't compatible with python3, so we switched to [twobitreader](https://github.com/benjschiller/twobitreader). While doing everything we needed and working under both python2 and python3, it turns out that it has terrible performance (up to a 1000x slowdown in `computeGCBias`). Since we'd like to have our cake and eat it too, I wrote a C library for convenient 2bit access and then [a python wrapper](https://github.com/dpryan79/py2bit) around it to work in python2 and 3.
11+
*
12+
* \section Installation
13+
*
14+
* 2bit files are very simple and there are no dependencies. Simply typing `make` should suffice for compilation. To install into a specific path (the default is `/usr/local`):
15+
*
16+
* make install prefix=/some/where/else
17+
*
18+
* `lib2bit.so` and `lib2bit.a` will then be in `/some/where/else/lib` and `2bit.h` in `/some/where/else/include`.
19+
*
20+
* \section Example
21+
*
22+
* See the `test/` directory for an example of using the library.
23+
*/
24+
25+
/*! \file 2bit.h
26+
*
27+
* These are all functions and structures exported in lib2bit. There are a few things that could be more efficiently implemented, but at the moment everything is "fast enough".
28+
*/
29+
430
#ifdef __cplusplus
531
extern "C" {
632
#endif
733

34+
/*!
35+
* @brief This structure holds the fixed-sized file header (16 bytes, of which 4 are blank). The version should always be 0. In theory, the endianness of the magic number can change (indicating that everything in the file should be swapped). As I've never actually seen this occur in the wild I've not bothered implementing it, though it'd be simple enough to do so.
36+
*/
837
typedef struct {
938
uint32_t magic; /**<Holds the magic number, should be 0x1A412743 */
1039
uint32_t version; /**<File version, should be 0 */
1140
uint32_t nChroms; /**<Number of chromosomes/contigs */
1241
} TwoBitHeader;
1342

43+
/*!
44+
* @brief This structure holds the chromosome names and the offset to the on-disk beginning of their sequences
45+
*/
1446
typedef struct {
1547
char **chrom; /**<A list of null terminated chromosomes */
1648
uint32_t *offset; /**<The file offset for the beginning of each chromosome */
1749
} TwoBitCL;
1850

51+
/*!
52+
* @brief This structure holds the number, location and size of the hard (N) and soft (lower case) masked blocks.
53+
*
54+
* Note that this isn't a great data structure for random access, particularly for the soft-masked blocks. In practice, soft-masking is typically ignored and file access is less random and more blocky. Nonetheless, if performance is not acceptable then this is the structure to change.
55+
*/
1956
typedef struct {
2057
uint32_t *size; /**<The size of a given chromosome/contig */
2158
uint32_t *nBlockCount; /**<The number of blocks of Ns in a given chromosome/contig */
@@ -27,31 +64,71 @@ typedef struct {
2764
uint64_t *offset; /**<The offset to the packed 2-bit sequence */
2865
} TwoBitMaskedIdx;
2966

67+
/*!
68+
* @brief This is the main structure for holding a 2bit file
69+
*
70+
* Note that currently the 2bit file is mmap()ed prior to reading and that this isn't optional.
71+
*/
3072
typedef struct {
31-
FILE *fp; //The file pointer for the opened file
32-
uint64_t sz; //File size in bytes (needed for munmap)
33-
uint64_t offset; //If the file is memory mapped, then this is the current file offset (otherwise ignored)
34-
void *data; //The memory mapped file, if it exists.
35-
TwoBitHeader *hdr; //File header
36-
TwoBitCL *cl; //Chromosome list with sizes
37-
TwoBitMaskedIdx *idx; //Index of masked blocks
73+
FILE *fp; /**<The file pointer for the opened file */
74+
uint64_t sz; /**<File size in bytes (needed for munmap) */
75+
uint64_t offset; /**<If the file is memory mapped, then this is the current file offset (otherwise ignored) */
76+
void *data; /**<The memory mapped file, if it exists. */
77+
TwoBitHeader *hdr; /**<File header */
78+
TwoBitCL *cl; /**<Chromosome list with sizes */
79+
TwoBitMaskedIdx *idx; /**<Index of masked blocks */
3880
} TwoBit;
3981

40-
//Open/close functions
82+
/*!
83+
* @brief Opens a local 2bit file
84+
*
85+
* @param fname The name of the 2bit file.
86+
* @param storeMasked Whether soft-masking information should be stored. If this is 1 then soft-masking information will be stored and the `twobitSequence()` function will return lower case letters in soft-masked regions. Note that this has a considerable performance and memory impact.
87+
* @return A pointer to a TwoBit object.
88+
* @note The file is memory mapped.
89+
*/
4190
TwoBit* twobitOpen(char *fname, int storeMasked);
91+
92+
/*!
93+
* @brief Closes a 2bit file and frees memory.
94+
*/
4295
void twobitClose(TwoBit *tb);
4396

44-
//Return the length of a given chromosome/contig or 0 if not present
97+
/*!
98+
* @brief Returns the length of a given chromosome.
99+
*
100+
* @param tb A pointer to a TwoBit object.
101+
* @param chrom The chromosome name.
102+
* @return The chromosome length as a uint32_t. Note that 0 is returned if the chromosome/contig isn't present in the file.
103+
*/
45104
uint32_t twobitChromLen(TwoBit *tb, char *chrom);
46105

47-
//Return the sequence of the given range (or a whole chromosome if start=end=0
106+
/*!
107+
* @brief Returns the sequence of a chromosome/contig or range of it.
108+
*
109+
* @param tb A pointer to a TwoBit object.
110+
* @param chrom The chromosome name.
111+
* @param start The starting position in 0-based coordinates.
112+
* @param end The end position in 1-based coordinates.
113+
* @return The sequence or NULL on error. If both start and end are 0 then the sequence for the entire chromosome/contig is returned.
114+
* @note The result MUST be `free()`d. Care is taken to return reasonable sequences when illegal regions are requested. If the end value is beyond the possible end of the chromosome then it is modified accordingly.
115+
*/
48116
char *twobitSequence(TwoBit *tb, char *chrom, uint32_t start, uint32_t end);
49117

50-
//Return a pointer to either 4 doubles or 4 uint32_ts holding per-base frequencies or counts.
51-
//The order is A, C, T, G
52-
void *twobitBases(TwoBit *tb, char *chrom, uint32_t start, uint32_t end, int fractional);
118+
/*!
119+
* @brief Return the number/fraction of A, C, T, and G in a chromosome/region
120+
*
121+
* @param tb A pointer to a TwoBit object.
122+
* @param chrom The chromosome name.
123+
* @param start The starting position in 0-based coordinates.
124+
* @param end The end position in 1-based coordinates.
125+
* @param fraction Whether to return the values as fractions (1) or integers (0).
126+
* @return If fraction is not 0, then 4 `double`s with the fraction of bases as A, C, T and G, respectively. If fraction is 0, integer counts are returned as 4 `uint32_t`s in the aforementioned order.
127+
* @note On error NULL is returned. The result MUST be `free()`d.
128+
*/
129+
130+
void *twobitBases(TwoBit *tb, char *chrom, uint32_t start, uint32_t end, int fraction);
53131

54132
#ifdef __cplusplus
55133
}
56134
#endif
57-

0 commit comments

Comments
 (0)