You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: lib2bit/2bit.h
+91-14Lines changed: 91 additions & 14 deletions
Original file line number
Diff line number
Diff line change
@@ -1,21 +1,58 @@
1
1
#include<inttypes.h>
2
2
#include<stdio.h>
3
3
4
+
/*! \mainpage lib2bit
5
+
*
6
+
* \section Introduction
7
+
*
8
+
* lib2bit is a C-based library for accessing [2bit files](https://genome.ucsc.edu/FAQ/FAQformat.html#format7). At the moment, only reading 2bit files is supported (there are no plans to change this, though if someone wants to submit a pull request...). Though it's unlikely to matter,
9
+
*
10
+
* The motivation for this project is due to needing fast access to 2bit files in [deepTools](https://github.com/fidelram/deepTools). Originally, we were using bx-python for this, which had the benefit of being easy to install and pretty quick. However, that wasn't compatible with python3, so we switched to [twobitreader](https://github.com/benjschiller/twobitreader). While doing everything we needed and working under both python2 and python3, it turns out that it has terrible performance (up to 1000x slow down in `computeGCBias`). Since we'd like to have our cake and eat it too, I wrote a C library for convenient 2bit access and then [a python wrapper](https://github.com/dpryan79/py2bit) around it to work in python2 and 3.
11
+
*
12
+
* \section Installation
13
+
*
14
+
* 2bit files are very simple and there are no dependencies. Simply typing `make` should suffice for compilation. To install into a specific path (the default is `/usr/local`):
15
+
*
16
+
* make install prefix=/some/where/else
17
+
*
18
+
* `lib2bit.so` and `lib2bit.a` will then be in `/some/where/else/lib` and `2bit.h` in `/some/where/else/include`.
19
+
*
20
+
* \section Example
21
+
*
22
+
* See the `test/` directory for an example of using the library.
23
+
*/
24
+
25
+
/*! \file 2bit.h
26
+
*
27
+
* These are all functions and structures exported in lib2bit. There are a few things that could be more efficiently implemented, but at the moment everything is "fast enough".
28
+
*/
29
+
4
30
#ifdef__cplusplus
5
31
extern"C" {
6
32
#endif
7
33
34
+
/*!
35
+
* @brief This structure holds the fixed-sized file header (16 bytes, of which 4 are blank). The version should always be 0. In theory, the endianness of the magic number can change (indicating that everything in the file should be swapped). As I've never actually seen this occur in the wild I've not bothered implementing it, though it'd be simple enough to do so.
36
+
*/
8
37
typedefstruct {
9
38
uint32_tmagic; /**<Holds the magic number, should be 0x1A412743 */
10
39
uint32_tversion; /**<File version, should be 0 */
11
40
uint32_tnChroms; /**<Number of chromosomes/contigs */
12
41
} TwoBitHeader;
13
42
43
+
/*!
44
+
* @brief This structure holds the chromosome names and the offset to the on-disk beginning of their sequences
45
+
*/
14
46
typedefstruct {
15
47
char**chrom; /**<A list of null terminated chromosomes */
16
48
uint32_t*offset; /**<The file offset for the beginning of each chromosome */
17
49
} TwoBitCL;
18
50
51
+
/*!
52
+
* @brief This structure holds the number, location and size of the hard (N) and soft (lower case) masked blocks.
53
+
*
54
+
* Note that this isn't a great data structure for random access, particularly for the soft-masked blocks. In practice, soft-masking is typically ignored and file access is less random and more blocky. Nonetheless, if performance is not acceptable then this is the structure to change.
55
+
*/
19
56
typedefstruct {
20
57
uint32_t*size; /**<The size of a given chromosome/contig */
21
58
uint32_t*nBlockCount; /**<The number of blocks of Ns in a given chromosome/contig */
@@ -27,31 +64,71 @@ typedef struct {
27
64
uint64_t*offset; /**<The offset to the packed 2-bit sequence */
28
65
} TwoBitMaskedIdx;
29
66
67
+
/*!
68
+
* @brief This is the main structure for holding a 2bit file
69
+
*
70
+
* Note that currently the 2bit file is mmap()ed prior to reading and that this isn't optional.
71
+
*/
30
72
typedefstruct {
31
-
FILE*fp; //The file pointer for the opened file
32
-
uint64_tsz; //File size in bytes (needed for munmap)
33
-
uint64_toffset; //If the file is memory mapped, then this is the current file offset (otherwise ignored)
34
-
void*data; //The memory mapped file, if it exists.
35
-
TwoBitHeader*hdr; //File header
36
-
TwoBitCL*cl; //Chromosome list with sizes
37
-
TwoBitMaskedIdx*idx; //Index of masked blocks
73
+
FILE*fp; /**<The file pointer for the opened file */
74
+
uint64_tsz; /**<File size in bytes (needed for munmap) */
75
+
uint64_toffset; /**<If the file is memory mapped, then this is the current file offset (otherwise ignored) */
76
+
void*data; /**<The memory mapped file, if it exists. */
77
+
TwoBitHeader*hdr; /**<File header */
78
+
TwoBitCL*cl; /**<Chromosome list with sizes */
79
+
TwoBitMaskedIdx*idx; /**<Index of masked blocks */
38
80
} TwoBit;
39
81
40
-
//Open/close functions
82
+
/*!
83
+
* @brief Opens a local 2bit file
84
+
*
85
+
* @param fname The name of the 2bit file.
86
+
* @param storeMasked Whether soft-masking information should be stored. If this is 1 then soft-masking information will be stored and the `twobitSequence()` function will return lower case letters in soft-masked regions. Note that this has a considerable performance and memory impact.
87
+
* @return A pointer to a TwoBit object.
88
+
* @note The file is memory mapped.
89
+
*/
41
90
TwoBit*twobitOpen(char*fname, intstoreMasked);
91
+
92
+
/*!
93
+
* @brief Closes a 2bit file and frees memory.
94
+
*/
42
95
voidtwobitClose(TwoBit*tb);
43
96
44
-
//Return the length of a given chromosome/contig or 0 if not present
97
+
/*!
98
+
* @brief Returns the length of a given chromosome.
99
+
*
100
+
* @param tb A pointer to a TwoBit object.
101
+
* @param chrom The chromosome name.
102
+
* @return The chromosome length as a uint32_t. Note that if the chromosome/contig isn't present in the file then 0 is returned.
103
+
*/
45
104
uint32_ttwobitChromLen(TwoBit*tb, char*chrom);
46
105
47
-
//Return the sequence of the given range (or a whole chromosome if start=end=0
106
+
/*!
107
+
* @brief Returns the sequence of a chromosome/contig or range of it.
108
+
*
109
+
* @param tb A pointer to a TwoBit object.
110
+
* @param chrom The chromosome name.
111
+
* @param start The starting position in 0-based coordinates.
112
+
* @param end The end position in 1-based coordinates.
113
+
* @return The sequence or NULL on error. If both start and end are 0 then the sequence for the entire chromosome/contig is returned.
114
+
* @note The result MUST be `free()`d. Care is taken to return reasonable sequences when illegal regions are requested. If the end value is beyond the possible end of the chromosome then it is modified accordingly.
* @brief Return the number/fraction of A, C, T, and G in a chromosome/region
120
+
*
121
+
* @param tb A pointer to a TwoBit object.
122
+
* @param chrom The chromosome name.
123
+
* @param start The starting position in 0-based coordinates.
124
+
* @param end The end position in 1-based coordinates.
125
+
* @param fraction Whether to return the values as fractions (1) or integers (0).
126
+
* @return If fraction is not 0, then 4 `double`s with the fraction of bases as A, C, T and G, respectively. If fraction is 0, integer counts are returned as 4 `uint32_t`s in the aforementioned order.
127
+
* @note On error NULL is returned. The result MUST be `free()`d.
0 commit comments