@@ -8,26 +8,31 @@ Cluster::Cluster(Options* opt){
8
8
}
9
9
10
10
// Destructor: releases every Pair still stored in mPairs. The map itself is
// cleaned up by its own destructor afterwards.
Cluster::~Cluster() {
    map<string, Pair*>::iterator it = mPairs.begin();
    while (it != mPairs.end()) {
        delete it->second;
        ++it;
    }
}
16
16
17
17
// Stores p in this cluster, keyed by the query name reported by p->getQName().
//
// If a different Pair is already stored under the same query name, that older
// Pair is deleted and replaced by p. If p itself is already stored under that
// name, the call is a no-op: deleting the existing entry in that case would
// leave a dangling pointer in the map and cause a double delete later.
//
// p must not be NULL (it is dereferenced to obtain the key).
void Cluster::addPair(Pair* p) {
    const string qname = p->getQName();

    // Single lookup instead of count() followed by two operator[] calls.
    map<string, Pair*>::iterator found = mPairs.find(qname);
    if (found == mPairs.end()) {
        mPairs[qname] = p;
        return;
    }

    if (found->second != p) {
        delete found->second;
        found->second = p;
    }
}
20
23
21
24
void Cluster::dump (){
22
- for (int i=0 ; i<mPairs .size (); i++) {
23
- mPairs [i]->dump ();
25
+ map<string, Pair*>::iterator iter;
26
+ for (iter = mPairs .begin (); iter!=mPairs .end (); iter++) {
27
+ iter->second ->dump ();
24
28
}
25
29
}
26
30
27
31
// Returns true as soon as any Pair stored in this cluster reports
// isDupWith(p); returns false when no stored Pair does (including when the
// cluster is empty).
bool Cluster::matches(Pair* p) {
    bool duplicated = false;
    map<string, Pair*>::iterator it = mPairs.begin();
    // Stop scanning at the first hit, exactly like an early return would.
    while (!duplicated && it != mPairs.end()) {
        if (it->second->isDupWith(p)) {
            duplicated = true;
        }
        ++it;
    }
    return duplicated;
}
@@ -93,8 +98,9 @@ int Cluster::umiDiff(const string& umi1, const string& umi2) {
93
98
vector<Pair*> Cluster::clusterByUMI (int umiDiffThreshold) {
94
99
vector<Cluster*> subClusters;
95
100
map<string, int > umiCount;
96
- for (int i=0 ; i<mPairs .size (); i++) {
97
- string umi = mPairs [i]->getUMI ();
101
+ map<string, Pair*>::iterator iterOfPairs;
102
+ for (iterOfPairs = mPairs .begin (); iterOfPairs!=mPairs .end (); iterOfPairs++) {
103
+ string umi = iterOfPairs->second ->getUMI ();
98
104
umiCount[umi]++;
99
105
}
100
106
while (mPairs .size ()>0 ) {
@@ -112,9 +118,9 @@ vector<Pair*> Cluster::clusterByUMI(int umiDiffThreshold) {
112
118
Cluster* c = new Cluster (mOptions );
113
119
114
120
// create the group by the top UMI
115
- vector< Pair*>::iterator piter;
121
+ map<string, Pair*>::iterator piter;
116
122
for (piter = mPairs .begin (); piter!=mPairs .end ();){
117
- Pair* p = * piter;
123
+ Pair* p = piter-> second ;
118
124
string umi = p->getUMI ();
119
125
if (umiDiff (umi, topUMI) <= umiDiffThreshold) {
120
126
c->addPair (p);
@@ -146,13 +152,6 @@ vector<Pair*> Cluster::clusterByUMI(int umiDiffThreshold) {
146
152
}
147
153
148
154
Pair* Cluster::consensusMerge () {
149
- /* if(mPairs.size() == 1) {
150
- Pair* p = mPairs[mPairs.size()-1];
151
- p->mMergeReads = mPairs.size();
152
- mPairs.pop_back();
153
- return p;
154
- }*/
155
-
156
155
int leftDiff = 0 ;
157
156
int rightDiff = 0 ;
158
157
bam1_t * left = consensusMergeBam (true , leftDiff);
@@ -181,45 +180,88 @@ Pair* Cluster::consensusMerge() {
181
180
}
182
181
183
182
bam1_t * Cluster::consensusMergeBam (bool isLeft, int & diff) {
183
+ if (mPairs .size () < mOptions ->clusterSizeReq ) {
184
+ return NULL ;
185
+ }
186
+ vector<Pair*> allPairs;
187
+ map<string, Pair*>::iterator iterOfPairs;
188
+ for (iterOfPairs = mPairs .begin (); iterOfPairs!=mPairs .end (); iterOfPairs++) {
189
+ allPairs.push_back (iterOfPairs->second );
190
+ }
191
+ if (mPairs .size () > mOptions ->skipLowComplexityClusterThreshold ) {
192
+ map<string, int > cigars;
193
+ bam1_t * firstRead = NULL ;
194
+ for (iterOfPairs = mPairs .begin (); iterOfPairs!=mPairs .end (); iterOfPairs++) {
195
+ Pair* p = iterOfPairs->second ;
196
+ bam1_t * b = p->mLeft ;
197
+ if (!isLeft)
198
+ b = p->mRight ;
199
+ if (b) {
200
+ string qname = BamUtil::getQName (b);
201
+ if (cigars.count (qname) == 0 )
202
+ cigars[qname] = 1 ;
203
+ else
204
+ cigars[qname]++;
205
+ if (!firstRead)
206
+ firstRead = b;
207
+ }
208
+ }
209
+ // this is abnormal, usually due to mapping result of low complexity reads
210
+ if (cigars.size () > mPairs .size () * 0.5 && firstRead) {
211
+ string seq = BamUtil::getSeq (firstRead);
212
+ int diffNeighbor = 0 ;
213
+ for (int i=0 ;i<seq.length ()-1 ;i++) {
214
+ if (seq[i] != seq[i+1 ])
215
+ diffNeighbor++;
216
+ }
217
+ if (diffNeighbor < seq.length ()*0.5 ) {
218
+ if (mOptions ->debug ) {
219
+ cerr << " Skipping " << mPairs .size () << " low complexity reads like: " << seq << endl;
220
+ }
221
+ return NULL ;
222
+ }
223
+ }
224
+ }
225
+
184
226
bool leftReadMode = isLeft;
185
227
// if processing right reads, check if this group is aligned by left
186
228
if (!isLeft) {
187
229
bool leftAligned = true ;
188
230
int lastPos = -1 ;
189
- for (int i=0 ; i<mPairs .size (); i++) {
190
- if (mPairs [i]->mRight ) {
191
- if (lastPos >= 0 && mPairs [i]->mRight ->core .pos != lastPos) {
231
+ for (int i=0 ; i<allPairs .size (); i++) {
232
+ if (allPairs [i]->mRight ) {
233
+ if (lastPos >= 0 && allPairs [i]->mRight ->core .pos != lastPos) {
192
234
leftAligned = false ;
193
235
break ;
194
236
}
195
- lastPos = mPairs [i]->mRight ->core .pos ;
237
+ lastPos = allPairs [i]->mRight ->core .pos ;
196
238
}
197
239
}
198
240
// if it's left aligned, then process them as left reads
199
241
if (leftAligned)
200
242
leftReadMode = true ;
201
243
}
202
244
// first we get a read that is most contained by other reads
203
- vector<int > containedByList (mPairs .size (), 0 );
204
- for (int i=0 ; i<mPairs .size (); i++) {
245
+ vector<int > containedByList (allPairs .size (), 0 );
246
+ for (int i=0 ; i<allPairs .size (); i++) {
205
247
bam1_t * part = NULL ;
206
248
if (isLeft)
207
- part = mPairs [i]->mLeft ;
249
+ part = allPairs [i]->mLeft ;
208
250
else
209
- part = mPairs [i]->mRight ;
251
+ part = allPairs [i]->mRight ;
210
252
if (part == NULL )
211
253
continue ;
212
254
213
255
int containedBy = 1 ;
214
256
215
- for (int j=0 ; j<mPairs .size (); j++) {
257
+ for (int j=0 ; j<allPairs .size (); j++) {
216
258
if (i == j)
217
259
continue ;
218
260
bam1_t * whole = NULL ;
219
261
if (isLeft)
220
- whole = mPairs [j]->mLeft ;
262
+ whole = allPairs [j]->mLeft ;
221
263
else
222
- whole = mPairs [j]->mRight ;
264
+ whole = allPairs [j]->mRight ;
223
265
if (whole == NULL )
224
266
continue ;
225
267
@@ -248,15 +290,15 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
248
290
int thisLen = 0 ;
249
291
int curLen = 0 ;
250
292
if (isLeft) {
251
- if (mPairs [i]->mLeft )
252
- thisLen = mPairs [i]->mLeft ->core .l_qseq ;
253
- if (mPairs [mostContainedById]->mLeft )
254
- curLen = mPairs [mostContainedById]->mLeft ->core .l_qseq ;
293
+ if (allPairs [i]->mLeft )
294
+ thisLen = allPairs [i]->mLeft ->core .l_qseq ;
295
+ if (allPairs [mostContainedById]->mLeft )
296
+ curLen = allPairs [mostContainedById]->mLeft ->core .l_qseq ;
255
297
} else {
256
- if (mPairs [i]->mRight )
257
- thisLen = mPairs [i]->mRight ->core .l_qseq ;
258
- if (mPairs [mostContainedById]->mRight )
259
- curLen = mPairs [mostContainedById]->mRight ->core .l_qseq ;
298
+ if (allPairs [i]->mRight )
299
+ thisLen = allPairs [i]->mRight ->core .l_qseq ;
300
+ if (allPairs [mostContainedById]->mRight )
301
+ curLen = allPairs [mostContainedById]->mRight ->core .l_qseq ;
260
302
}
261
303
if (thisLen < curLen) {
262
304
mostContainedByNum = containedByList[i];
@@ -266,22 +308,23 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
266
308
}
267
309
268
310
// no majority
269
- if (mostContainedByNum < containedByList.size ()*0.4 && containedByList.size () != 1 )
311
+ if (mostContainedByNum < containedByList.size ()*0.4 && containedByList.size () != 1 ) {
270
312
return NULL ;
313
+ }
271
314
272
315
bam1_t * out = NULL ;
273
316
char * outScore = NULL ;
274
317
if (isLeft) {
275
- out = mPairs [mostContainedById]->mLeft ;
276
- outScore = mPairs [mostContainedById]->getLeftScore ();
318
+ out = allPairs [mostContainedById]->mLeft ;
319
+ outScore = allPairs [mostContainedById]->getLeftScore ();
277
320
// make it null so that it will not be deleted
278
- mPairs [mostContainedById]->mLeft = NULL ;
321
+ allPairs [mostContainedById]->mLeft = NULL ;
279
322
}
280
323
else {
281
- out = mPairs [mostContainedById]->mRight ;
282
- outScore = mPairs [mostContainedById]->getRightScore ();
324
+ out = allPairs [mostContainedById]->mRight ;
325
+ outScore = allPairs [mostContainedById]->getRightScore ();
283
326
// make it null so that it will not be deleted
284
- mPairs [mostContainedById]->mRight = NULL ;
327
+ allPairs [mostContainedById]->mRight = NULL ;
285
328
}
286
329
287
330
if (out == NULL ) {
@@ -294,18 +337,18 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
294
337
reads.push_back (out);
295
338
scores.push_back (outScore);
296
339
297
- for (int j=0 ; j<mPairs .size (); j++) {
340
+ for (int j=0 ; j<allPairs .size (); j++) {
298
341
if (mostContainedById == j)
299
342
continue ;
300
343
bam1_t * read = NULL ;
301
344
char * score = NULL ;
302
345
if (isLeft) {
303
- read = mPairs [j]->mLeft ;
304
- score = mPairs [j]->getLeftScore ();
346
+ read = allPairs [j]->mLeft ;
347
+ score = allPairs [j]->getLeftScore ();
305
348
}
306
349
else {
307
- read = mPairs [j]->mRight ;
308
- score = mPairs [j]->getRightScore ();
350
+ read = allPairs [j]->mRight ;
351
+ score = allPairs [j]->getRightScore ();
309
352
}
310
353
if (read == NULL || score == NULL )
311
354
continue ;
@@ -322,32 +365,6 @@ bam1_t* Cluster::consensusMergeBam(bool isLeft, int& diff) {
322
365
return NULL ;
323
366
}
324
367
325
- // if the sequences are right ones of pairs, we check whether it a completely a chaos
326
- /* if(!isLeft) {
327
- int bothEndNotAligned = 0;
328
- for(int r=0; r<reads.size(); r++) {
329
- // left aligned
330
- if(reads[r]->core.pos == out->core.pos && BamUtil::isPartOf(out, reads[r], true))
331
- continue;
332
- // right aligned
333
- if(reads[r]->core.pos + bam_cigar2rlen(reads[r]->core.n_cigar, (uint32_t *)bam_get_cigar(reads[r])) == out->core.pos + bam_cigar2rlen(out->core.n_cigar, (uint32_t *)bam_get_cigar(out)))
334
- continue;
335
-
336
- // both not aligned
337
- bothEndNotAligned++;
338
- }
339
-
340
- if(bothEndNotAligned*2 >= reads.size()) {
341
- cerr << "Chaos of " << reads.size() << " reads: " << BamUtil::getQName(out) << endl;
342
- for(int r=0; r<reads.size(); r++) {
343
- cerr << reads[r]->core.pos << "," << reads[r]->core.mpos << "," << reads[r]->core.isize << "," << reads[r]->core.l_qseq << "," << BamUtil::getCigar(reads[r]) << endl;
344
- }
345
- bam_destroy1(out);
346
- out = NULL;
347
- return NULL;
348
- }
349
- }*/
350
-
351
368
diff = makeConsensus (reads, out, scores, leftReadMode);
352
369
353
370
return out;
@@ -615,36 +632,20 @@ void Cluster::addRead(bam1_t* b) {
615
632
if (b->core .isize > 0 ) {
616
633
Pair* p = new Pair (mOptions );
617
634
p->setLeft (b);
618
- mPairs . push_back (p) ;
635
+ mPairs [p-> getQName ()]=p ;
619
636
return ;
620
637
}
621
- // right or unproper paired
622
- bool found = false ;
623
- string qname = BamUtil::getQName (b);
624
- for (int i=0 ; i<mPairs .size (); i++) {
625
- Pair* p = mPairs [i];
626
- if (p->mLeft && !p->mRight ) {
627
- if (p->getQName () == qname) {
628
- p->setRight (b);
629
- found = true ;
630
- break ;
631
- }
632
- } else if (p->mRight && !p->mLeft ) {
633
- if (p->getQName () == qname) {
634
- p->setLeft (b);
635
- found = true ;
636
- break ;
637
- }
638
- }
639
- }
640
638
641
- if (!found) {
639
+ string qname = BamUtil::getQName (b);
640
+ if (mPairs .count (qname) > 0 )
641
+ mPairs [qname]->setRight (b);
642
+ else {
642
643
Pair* p = new Pair (mOptions );
643
644
if (b->core .isize < 0 )
644
645
p->setRight (b);
645
646
else
646
647
p->setLeft (b);
647
- mPairs . push_back (p) ;
648
+ mPairs [qname]=p ;
648
649
}
649
650
}
650
651
0 commit comments