Skip to content

Commit 4dcc9f3

Browse files
committed
improve the speed by skipping large low complexity cluster
which is usually caused by mapping errors
1 parent 209bdbb commit 4dcc9f3

File tree

6 files changed

+109
-103
lines changed

6 files changed

+109
-103
lines changed

src/cluster.cpp

Lines changed: 100 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -8,26 +8,31 @@ Cluster::Cluster(Options* opt){
88
}
99

1010
Cluster::~Cluster(){
11-
for(int i=0; i<mPairs.size(); i++) {
12-
delete mPairs[i];
13-
mPairs[i] = NULL;
11+
map<string, Pair*>::iterator iter;
12+
for(iter = mPairs.begin(); iter!=mPairs.end(); iter++) {
13+
delete iter->second;
1414
}
1515
}
1616

1717
void Cluster::addPair(Pair* p){
18-
mPairs.push_back(p);
18+
string qname = p->getQName();
19+
if(mPairs.count(qname)>0)
20+
delete mPairs[qname];
21+
mPairs[qname] = p;
1922
}
2023

2124
void Cluster::dump(){
22-
for(int i=0; i<mPairs.size(); i++) {
23-
mPairs[i]->dump();
25+
map<string, Pair*>::iterator iter;
26+
for(iter = mPairs.begin(); iter!=mPairs.end(); iter++) {
27+
iter->second->dump();
2428
}
2529
}
2630

2731
bool Cluster::matches(Pair* p){
28-
for(int i=0; i<mPairs.size(); i++) {
29-
if(mPairs[i]->isDupWith(p))
30-
return true;
32+
map<string, Pair*>::iterator iter;
33+
for(iter = mPairs.begin(); iter!=mPairs.end(); iter++) {
34+
if(iter->second->isDupWith(p))
35+
return true;
3136
}
3237
return false;
3338
}
@@ -93,8 +98,9 @@ int Cluster::umiDiff(const string& umi1, const string& umi2) {
9398
vector<Pair*> Cluster::clusterByUMI(int umiDiffThreshold) {
9499
vector<Cluster*> subClusters;
95100
map<string, int> umiCount;
96-
for(int i=0; i<mPairs.size(); i++) {
97-
string umi = mPairs[i]->getUMI();
101+
map<string, Pair*>::iterator iterOfPairs;
102+
for(iterOfPairs = mPairs.begin(); iterOfPairs!=mPairs.end(); iterOfPairs++) {
103+
string umi = iterOfPairs->second->getUMI();
98104
umiCount[umi]++;
99105
}
100106
while(mPairs.size()>0) {
@@ -112,9 +118,9 @@ vector<Pair*> Cluster::clusterByUMI(int umiDiffThreshold) {
112118
Cluster* c = new Cluster(mOptions);
113119

114120
// create the group by the top UMI
115-
vector<Pair*>::iterator piter;
121+
map<string, Pair*>::iterator piter;
116122
for(piter = mPairs.begin(); piter!=mPairs.end();){
117-
Pair* p = *piter;
123+
Pair* p = piter->second;
118124
string umi = p->getUMI();
119125
if(umiDiff(umi, topUMI) <= umiDiffThreshold) {
120126
c->addPair(p);
@@ -146,13 +152,6 @@ vector<Pair*> Cluster::clusterByUMI(int umiDiffThreshold) {
146152
}
147153

148154
Pair* Cluster::consensusMerge() {
149-
/*if(mPairs.size() == 1) {
150-
Pair* p = mPairs[mPairs.size()-1];
151-
p->mMergeReads = mPairs.size();
152-
mPairs.pop_back();
153-
return p;
154-
}*/
155-
156155
int leftDiff = 0;
157156
int rightDiff = 0;
158157
bam1_t* left = consensusMergeBam(true, leftDiff);
@@ -181,45 +180,88 @@ Pair* Cluster::consensusMerge() {
181180
}
182181

183182
bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
183+
if(mPairs.size() < mOptions->clusterSizeReq) {
184+
return NULL;
185+
}
186+
vector<Pair*> allPairs;
187+
map<string, Pair*>::iterator iterOfPairs;
188+
for(iterOfPairs = mPairs.begin(); iterOfPairs!=mPairs.end(); iterOfPairs++) {
189+
allPairs.push_back(iterOfPairs->second);
190+
}
191+
if(mPairs.size() > mOptions->skipLowComplexityClusterThreshold) {
192+
map<string, int> cigars;
193+
bam1_t* firstRead = NULL;
194+
for(iterOfPairs = mPairs.begin(); iterOfPairs!=mPairs.end(); iterOfPairs++) {
195+
Pair* p = iterOfPairs->second;
196+
bam1_t* b = p->mLeft;
197+
if(!isLeft)
198+
b = p->mRight;
199+
if(b) {
200+
string qname = BamUtil::getQName(b);
201+
if(cigars.count(qname) == 0)
202+
cigars[qname] = 1;
203+
else
204+
cigars[qname]++;
205+
if(!firstRead)
206+
firstRead = b;
207+
}
208+
}
209+
// this is abnormal, usually due to mapping result of low complexity reads
210+
if(cigars.size() > mPairs.size() * 0.5 && firstRead) {
211+
string seq = BamUtil::getSeq(firstRead);
212+
int diffNeighbor = 0;
213+
for(int i=0;i<seq.length()-1;i++) {
214+
if(seq[i] != seq[i+1])
215+
diffNeighbor++;
216+
}
217+
if(diffNeighbor < seq.length()*0.5) {
218+
if(mOptions->debug) {
219+
cerr << "Skipping " << mPairs.size() << " low complexity reads like: " << seq << endl;
220+
}
221+
return NULL;
222+
}
223+
}
224+
}
225+
184226
bool leftReadMode = isLeft;
185227
// if processing right reads, check if this group is aligned by left
186228
if(!isLeft) {
187229
bool leftAligned = true;
188230
int lastPos = -1;
189-
for(int i=0; i<mPairs.size(); i++) {
190-
if(mPairs[i]->mRight) {
191-
if(lastPos >= 0 && mPairs[i]->mRight->core.pos != lastPos) {
231+
for(int i=0; i<allPairs.size(); i++) {
232+
if(allPairs[i]->mRight) {
233+
if(lastPos >= 0 && allPairs[i]->mRight->core.pos != lastPos) {
192234
leftAligned = false;
193235
break;
194236
}
195-
lastPos = mPairs[i]->mRight->core.pos;
237+
lastPos = allPairs[i]->mRight->core.pos;
196238
}
197239
}
198240
// if it's left aligned, then process them as left reads
199241
if(leftAligned)
200242
leftReadMode = true;
201243
}
202244
// first we get a read that is most contained by other reads
203-
vector<int> containedByList(mPairs.size(), 0);
204-
for(int i=0; i<mPairs.size(); i++) {
245+
vector<int> containedByList(allPairs.size(), 0);
246+
for(int i=0; i<allPairs.size(); i++) {
205247
bam1_t* part = NULL;
206248
if(isLeft)
207-
part = mPairs[i]->mLeft;
249+
part = allPairs[i]->mLeft;
208250
else
209-
part = mPairs[i]->mRight;
251+
part = allPairs[i]->mRight;
210252
if(part == NULL)
211253
continue;
212254

213255
int containedBy = 1;
214256

215-
for(int j=0; j<mPairs.size(); j++) {
257+
for(int j=0; j<allPairs.size(); j++) {
216258
if(i == j)
217259
continue;
218260
bam1_t* whole = NULL;
219261
if(isLeft)
220-
whole = mPairs[j]->mLeft;
262+
whole = allPairs[j]->mLeft;
221263
else
222-
whole = mPairs[j]->mRight;
264+
whole = allPairs[j]->mRight;
223265
if(whole == NULL)
224266
continue;
225267

@@ -248,15 +290,15 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
248290
int thisLen = 0;
249291
int curLen = 0;
250292
if(isLeft) {
251-
if(mPairs[i]->mLeft)
252-
thisLen = mPairs[i]->mLeft->core.l_qseq;
253-
if(mPairs[mostContainedById]->mLeft)
254-
curLen = mPairs[mostContainedById]->mLeft->core.l_qseq;
293+
if(allPairs[i]->mLeft)
294+
thisLen = allPairs[i]->mLeft->core.l_qseq;
295+
if(allPairs[mostContainedById]->mLeft)
296+
curLen = allPairs[mostContainedById]->mLeft->core.l_qseq;
255297
} else {
256-
if(mPairs[i]->mRight)
257-
thisLen = mPairs[i]->mRight->core.l_qseq;
258-
if(mPairs[mostContainedById]->mRight)
259-
curLen = mPairs[mostContainedById]->mRight->core.l_qseq;
298+
if(allPairs[i]->mRight)
299+
thisLen = allPairs[i]->mRight->core.l_qseq;
300+
if(allPairs[mostContainedById]->mRight)
301+
curLen = allPairs[mostContainedById]->mRight->core.l_qseq;
260302
}
261303
if(thisLen < curLen) {
262304
mostContainedByNum = containedByList[i];
@@ -266,22 +308,23 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
266308
}
267309

268310
// no marjority
269-
if(mostContainedByNum < containedByList.size()*0.4 && containedByList.size() != 1)
311+
if(mostContainedByNum < containedByList.size()*0.4 && containedByList.size() != 1) {
270312
return NULL;
313+
}
271314

272315
bam1_t* out = NULL;
273316
char* outScore = NULL;
274317
if(isLeft) {
275-
out = mPairs[mostContainedById]->mLeft;
276-
outScore = mPairs[mostContainedById]->getLeftScore();
318+
out = allPairs[mostContainedById]->mLeft;
319+
outScore = allPairs[mostContainedById]->getLeftScore();
277320
// make it null so that it will not be deleted
278-
mPairs[mostContainedById]->mLeft = NULL;
321+
allPairs[mostContainedById]->mLeft = NULL;
279322
}
280323
else {
281-
out = mPairs[mostContainedById]->mRight;
282-
outScore = mPairs[mostContainedById]->getRightScore();
324+
out = allPairs[mostContainedById]->mRight;
325+
outScore = allPairs[mostContainedById]->getRightScore();
283326
// make it null so that it will not be deleted
284-
mPairs[mostContainedById]->mRight = NULL;
327+
allPairs[mostContainedById]->mRight = NULL;
285328
}
286329

287330
if(out == NULL) {
@@ -294,18 +337,18 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
294337
reads.push_back(out);
295338
scores.push_back(outScore);
296339

297-
for(int j=0; j<mPairs.size(); j++) {
340+
for(int j=0; j<allPairs.size(); j++) {
298341
if(mostContainedById == j)
299342
continue;
300343
bam1_t* read = NULL;
301344
char* score = NULL;
302345
if(isLeft) {
303-
read = mPairs[j]->mLeft;
304-
score = mPairs[j]->getLeftScore();
346+
read = allPairs[j]->mLeft;
347+
score = allPairs[j]->getLeftScore();
305348
}
306349
else {
307-
read = mPairs[j]->mRight;
308-
score = mPairs[j]->getRightScore();
350+
read = allPairs[j]->mRight;
351+
score = allPairs[j]->getRightScore();
309352
}
310353
if(read == NULL || score == NULL)
311354
continue;
@@ -322,32 +365,6 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
322365
return NULL;
323366
}
324367

325-
// if the sequences are right ones of pairs, we check whether it a completely a chaos
326-
/*if(!isLeft) {
327-
int bothEndNotAligned = 0;
328-
for(int r=0; r<reads.size(); r++) {
329-
// left aligned
330-
if(reads[r]->core.pos == out->core.pos && BamUtil::isPartOf(out, reads[r], true))
331-
continue;
332-
// right aligned
333-
if(reads[r]->core.pos + bam_cigar2rlen(reads[r]->core.n_cigar, (uint32_t *)bam_get_cigar(reads[r])) == out->core.pos + bam_cigar2rlen(out->core.n_cigar, (uint32_t *)bam_get_cigar(out)))
334-
continue;
335-
336-
// both not aligned
337-
bothEndNotAligned++;
338-
}
339-
340-
if(bothEndNotAligned*2 >= reads.size()) {
341-
cerr << "Chaos of " << reads.size() << " reads: " << BamUtil::getQName(out) << endl;
342-
for(int r=0; r<reads.size(); r++) {
343-
cerr << reads[r]->core.pos << "," << reads[r]->core.mpos << "," << reads[r]->core.isize << "," << reads[r]->core.l_qseq << "," << BamUtil::getCigar(reads[r]) << endl;
344-
}
345-
bam_destroy1(out);
346-
out = NULL;
347-
return NULL;
348-
}
349-
}*/
350-
351368
diff = makeConsensus(reads, out, scores, leftReadMode);
352369

353370
return out;
@@ -615,36 +632,20 @@ void Cluster::addRead(bam1_t* b) {
615632
if(b->core.isize > 0) {
616633
Pair* p = new Pair(mOptions);
617634
p->setLeft(b);
618-
mPairs.push_back(p);
635+
mPairs[p->getQName()]=p;
619636
return;
620637
}
621-
// right or unproper paired
622-
bool found = false;
623-
string qname = BamUtil::getQName(b);
624-
for(int i=0; i<mPairs.size(); i++) {
625-
Pair* p = mPairs[i];
626-
if(p->mLeft && !p->mRight) {
627-
if(p->getQName() == qname) {
628-
p->setRight(b);
629-
found = true;
630-
break;
631-
}
632-
} else if(p->mRight && !p->mLeft) {
633-
if(p->getQName() == qname) {
634-
p->setLeft(b);
635-
found = true;
636-
break;
637-
}
638-
}
639-
}
640638

641-
if(!found) {
639+
string qname = BamUtil::getQName(b);
640+
if(mPairs.count(qname) > 0)
641+
mPairs[qname]->setRight(b);
642+
else {
642643
Pair* p = new Pair(mOptions);
643644
if(b->core.isize < 0)
644645
p->setRight(b);
645646
else
646647
p->setLeft(b);
647-
mPairs.push_back(p);
648+
mPairs[qname]=p;
648649
}
649650
}
650651

src/cluster.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class Cluster {
4242
static int umiDiff(const string& umi1, const string& umi2);
4343

4444
public:
45-
vector<Pair*> mPairs;
45+
map<string, Pair*> mPairs;
4646
Options* mOptions;
4747
};
4848

src/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#ifndef COMMON_H
22
#define COMMON_H
33

4-
#define VERSION_NUMBER "0.7.0"
4+
#define VERSION_NUMBER "0.8.0"
55

66
#define _DEBUG false
77

src/gencore.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,10 @@ void Gencore::finishConsensus(map<int, map<int, map<int, Cluster*>>>& clusters)
229229
for(iter3 = iter2->second.begin(); iter3 != iter2->second.end(); ) {
230230
// for unmapped reads, we just store them
231231
if(iter1->first < 0 || iter2->first < 0 || iter3->first < 0 ) {
232-
for(int i=0; i<iter3->second->mPairs.size(); i++) {
232+
map<string, Pair*>::iterator iterOfPairs;
233+
for(iterOfPairs = iter3->second->mPairs.begin(); iterOfPairs!=iter3->second->mPairs.end(); iterOfPairs++) {
233234
//csPairs[i]->dump();
234-
outputPair(iter3->second->mPairs[i]);
235+
outputPair(iterOfPairs->second);
235236
}
236237
} else {
237238
vector<Pair*> csPairs = iter3->second->clusterByUMI(mOptions->unproperReadsUmiDiffThreshold);

src/options.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Options::Options(){
2929
scoreOfUnbalancedMismatchLowQuality = 1;
3030

3131
baseScoreReq = scoreOfNotOverlapped;
32+
skipLowComplexityClusterThreshold = 1000;
3233
}
3334

3435
bool Options::validate() {

src/options.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ class Options{
4444
char scoreOfBothLowQualityMismatch;
4545
char scoreOfUnbalancedMismatchHighQuality;
4646
char scoreOfUnbalancedMismatchLowQuality;
47+
48+
// threshold for skipping low complexity cluster
49+
int skipLowComplexityClusterThreshold;
4750
};
4851

4952
#endif

0 commit comments

Comments
 (0)