Skip to content

Commit d2d7366

Browse files
committed
Add bioinfoSE 3181
1 parent cdf455f commit d2d7366

File tree

2 files changed

+135
-0
lines changed

2 files changed

+135
-0
lines changed

bioinfoSE_3181/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This was written in response to [this question on the bioinfo stack exchange](https://bioinformatics.stackexchange.com/questions/3181/generate-missing-5utr-and-3utr-information). In short, it'll take a GFF3 that's missing UTR entries and add them.

bioinfoSE_3181/addGFF3UTRs.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#!/usr/bin/env python
2+
import argparse
3+
4+
5+
def parseAttributes(kvps):
6+
d = dict()
7+
for kvp in kvps:
8+
k, v = kvp.split("=")
9+
d[k] = v
10+
return d
11+
12+
13+
def parseGFF3(fname, topLevel="gene", secondLevel="mRNA", CDS="CDS", exon="exon"):
14+
f = open(fname)
15+
genes = dict()
16+
transcripts = dict()
17+
g2t = dict() # A dictionary with gene IDs associated to their transcript IDs
18+
19+
for line in f:
20+
if line.startswith("#"):
21+
continue
22+
cols = line.strip().split("\t")
23+
attribs = parseAttributes(cols[8].split(";"))
24+
ID = attribs["ID"]
25+
26+
if cols[2] == topLevel:
27+
genes[ID] = line
28+
g2t[ID] = []
29+
elif cols[2] == secondLevel:
30+
parent = attribs["Parent"]
31+
g2t[parent].append(ID)
32+
33+
# Generate a list of [line, [exon entries], [CDS entries]]
34+
if ID not in transcripts:
35+
transcripts[ID] = [line, [], []]
36+
else:
37+
transcripts[ID][0] = line
38+
else:
39+
parent = attribs["Parent"]
40+
if parent not in transcripts:
41+
transcripts[parent] = [None, [], []]
42+
if cols[2] == exon:
43+
transcripts[parent][1].append((int(cols[3]), int(cols[4]), line))
44+
elif cols[2] == CDS:
45+
transcripts[parent][2].append((int(cols[3]), int(cols[4]), line))
46+
f.close()
47+
48+
return genes, transcripts, g2t
49+
50+
51+
def findUTRs(transcripts, fivePrime = True):
52+
for k, t in transcripts.items():
53+
# If there are no CDS entries then skip
54+
if len(t[2]) == 0:
55+
continue
56+
# Get the strand ("." will be treated as "+")
57+
strand = t[0].split("\t")[6]
58+
59+
# For 3' UTR, just swap the strand
60+
if not fivePrime:
61+
strand = "+" if strand == "-" else "-"
62+
63+
exons = [(s, e) for s, e, _ in t[1]]
64+
CDSs = [(s, e) for s, e, _ in t[2]]
65+
exons.sort()
66+
CDSs.sort()
67+
UTRs = []
68+
if strand != "-":
69+
final = CDSs[0][0]
70+
for s, e in exons:
71+
if e < final:
72+
UTRs.append((s,e))
73+
elif s < final:
74+
UTRs.append((s, final - 1))
75+
else:
76+
break
77+
else:
78+
final = CDSs[-1][-1]
79+
for s, e in exons:
80+
if e < final:
81+
continue
82+
elif s > final:
83+
UTRs.append((s, e))
84+
else:
85+
UTRs.append((final + 1, e))
86+
t.append(UTRs)
87+
88+
89+
def saveGFF(oname, genes, transcripts, g2t, fivePrime, threePrime):
90+
o = open(oname, "w")
91+
o.write("##gff-version 3\n")
92+
for geneID, geneLine in genes.items():
93+
o.write(geneLine)
94+
# Go through each transcript
95+
for transID in g2t[geneID]:
96+
t = transcripts[transID]
97+
if t[0] is None:
98+
continue
99+
o.write(t[0])
100+
cols = t[0].strip().split("\t")
101+
102+
# Write the exons, then CDS, then 5'UTR, then 3'UTR
103+
for exon in t[1]:
104+
o.write(exon[2])
105+
for CDS in t[2]:
106+
o.write(CDS[2])
107+
for idx, UTR in enumerate(t[3]):
108+
o.write("{}\t{}\t{}\t{}\t{}\t.\t{}\t.\t".format(cols[0], cols[1], fivePrime, UTR[0], UTR[1], cols[6]))
109+
o.write("ID={}.fivePrimeUTR{};Parent={}\n".format(transID, idx, transID))
110+
for idx, UTR in enumerate(t[4]):
111+
o.write("{}\t{}\t{}\t{}\t{}\t.\t{}\t.\t".format(cols[0], cols[1], threePrime, UTR[0], UTR[1], cols[6]))
112+
o.write("ID={}.threePrimeUTR{};Parent={}\n".format(transID, idx, transID))
113+
o.close()
114+
115+
116+
parser = argparse.ArgumentParser(description="Parse a GFF3 file lacking UTR entries and add them in. Note that this program assumes a reasonably formatted GFF3 file")
117+
parser.add_argument("--fiveUTRname", default="five_prime_UTR", help="The label for 5' UTR entries (default: %(default)s)")
118+
parser.add_argument("--threeUTRname", default="three_prime_UTR", help="The label for 3' UTR entries (default: %(default)s)")
119+
parser.add_argument("--topLevelID", default="gene", help="The 'type' designating the top-level entry (default: %(default)s)")
120+
parser.add_argument("--secondLevelID", default="mRNA", help="The 'type' designating the second-level entry, typically something like 'mRNA' or 'transcript' (default: %(default)s)")
121+
parser.add_argument("--exonID", default="exon", help="The 'type' designating exons (default: %(default)s)")
122+
parser.add_argument("--CDSID", default="CDS", help="The 'type' designating CDS (default: %(default)s)")
123+
parser.add_argument("input", metavar="input.gff3", help="Input file name")
124+
parser.add_argument("output", metavar="output.gff3", help="Output file name")
125+
args = parser.parse_args()
126+
127+
genes, transcripts, g2t = parseGFF3(args.input, topLevel=args.topLevelID, secondLevel=args.secondLevelID, exon=args.exonID, CDS=args.CDSID)
128+
129+
# Add the UTRs
130+
findUTRs(transcripts)
131+
findUTRs(transcripts, fivePrime=False)
132+
133+
# Write the output
134+
saveGFF(args.output, genes, transcripts, g2t, args.fiveUTRname, args.threeUTRname)

0 commit comments

Comments
 (0)