|  | 
|  | 1 | +#!/usr/bin/env python | 
|  | 2 | +import argparse | 
|  | 3 | + | 
|  | 4 | + | 
|  | 5 | +def parseAttributes(kvps): | 
|  | 6 | +    d = dict() | 
|  | 7 | +    for kvp in kvps: | 
|  | 8 | +        k, v = kvp.split("=") | 
|  | 9 | +        d[k] = v | 
|  | 10 | +    return d | 
|  | 11 | + | 
|  | 12 | + | 
|  | 13 | +def parseGFF3(fname, topLevel="gene", secondLevel="mRNA", CDS="CDS", exon="exon"): | 
|  | 14 | +    f = open(fname) | 
|  | 15 | +    genes = dict() | 
|  | 16 | +    transcripts = dict() | 
|  | 17 | +    g2t = dict()  # A dictionary with gene IDs associated to their transcript IDs | 
|  | 18 | + | 
|  | 19 | +    for line in f: | 
|  | 20 | +        if line.startswith("#"): | 
|  | 21 | +            continue | 
|  | 22 | +        cols = line.strip().split("\t") | 
|  | 23 | +        attribs = parseAttributes(cols[8].split(";")) | 
|  | 24 | +        ID = attribs["ID"] | 
|  | 25 | + | 
|  | 26 | +        if cols[2] == topLevel: | 
|  | 27 | +            genes[ID] = line | 
|  | 28 | +            g2t[ID] = [] | 
|  | 29 | +        elif cols[2] == secondLevel: | 
|  | 30 | +            parent = attribs["Parent"] | 
|  | 31 | +            g2t[parent].append(ID) | 
|  | 32 | + | 
|  | 33 | +            # Generate a list of [line, [exon entries], [CDS entries]] | 
|  | 34 | +            if ID not in transcripts: | 
|  | 35 | +                transcripts[ID] = [line, [], []] | 
|  | 36 | +            else: | 
|  | 37 | +                transcripts[ID][0] = line | 
|  | 38 | +        else: | 
|  | 39 | +            parent = attribs["Parent"] | 
|  | 40 | +            if parent not in transcripts: | 
|  | 41 | +                transcripts[parent] = [None, [], []] | 
|  | 42 | +            if cols[2] == exon: | 
|  | 43 | +                transcripts[parent][1].append((int(cols[3]), int(cols[4]), line)) | 
|  | 44 | +            elif cols[2] == CDS: | 
|  | 45 | +                transcripts[parent][2].append((int(cols[3]), int(cols[4]), line)) | 
|  | 46 | +    f.close() | 
|  | 47 | + | 
|  | 48 | +    return genes, transcripts, g2t | 
|  | 49 | + | 
|  | 50 | + | 
|  | 51 | +def findUTRs(transcripts, fivePrime = True): | 
|  | 52 | +    for k, t in transcripts.items(): | 
|  | 53 | +        # If there are no CDS entries then skip | 
|  | 54 | +        if len(t[2]) == 0: | 
|  | 55 | +            continue | 
|  | 56 | +        # Get the strand ("." will be treated as "+") | 
|  | 57 | +        strand = t[0].split("\t")[6] | 
|  | 58 | + | 
|  | 59 | +        # For 3' UTR, just swap the strand | 
|  | 60 | +        if not fivePrime: | 
|  | 61 | +            strand = "+" if strand == "-" else "-" | 
|  | 62 | + | 
|  | 63 | +        exons = [(s, e) for s, e, _ in t[1]] | 
|  | 64 | +        CDSs = [(s, e) for s, e, _ in t[2]] | 
|  | 65 | +        exons.sort() | 
|  | 66 | +        CDSs.sort() | 
|  | 67 | +        UTRs = [] | 
|  | 68 | +        if strand != "-": | 
|  | 69 | +            final = CDSs[0][0] | 
|  | 70 | +            for s, e in exons: | 
|  | 71 | +                if e < final: | 
|  | 72 | +                    UTRs.append((s,e)) | 
|  | 73 | +                elif s < final: | 
|  | 74 | +                    UTRs.append((s, final - 1)) | 
|  | 75 | +                else: | 
|  | 76 | +                    break | 
|  | 77 | +        else: | 
|  | 78 | +            final = CDSs[-1][-1] | 
|  | 79 | +            for s, e in exons: | 
|  | 80 | +                if e < final: | 
|  | 81 | +                    continue | 
|  | 82 | +                elif s > final: | 
|  | 83 | +                    UTRs.append((s, e)) | 
|  | 84 | +                else: | 
|  | 85 | +                    UTRs.append((final + 1, e)) | 
|  | 86 | +        t.append(UTRs) | 
|  | 87 | + | 
|  | 88 | + | 
|  | 89 | +def saveGFF(oname, genes, transcripts, g2t, fivePrime, threePrime): | 
|  | 90 | +    o = open(oname, "w") | 
|  | 91 | +    o.write("##gff-version 3\n") | 
|  | 92 | +    for geneID, geneLine in genes.items(): | 
|  | 93 | +        o.write(geneLine) | 
|  | 94 | +        # Go through each transcript | 
|  | 95 | +        for transID in g2t[geneID]: | 
|  | 96 | +            t = transcripts[transID] | 
|  | 97 | +            if t[0] is None: | 
|  | 98 | +                continue | 
|  | 99 | +            o.write(t[0]) | 
|  | 100 | +            cols = t[0].strip().split("\t") | 
|  | 101 | + | 
|  | 102 | +            # Write the exons, then CDS, then 5'UTR, then 3'UTR | 
|  | 103 | +            for exon in t[1]: | 
|  | 104 | +                o.write(exon[2]) | 
|  | 105 | +            for CDS in t[2]: | 
|  | 106 | +                o.write(CDS[2]) | 
|  | 107 | +            for idx, UTR in enumerate(t[3]): | 
|  | 108 | +                o.write("{}\t{}\t{}\t{}\t{}\t.\t{}\t.\t".format(cols[0], cols[1], fivePrime, UTR[0], UTR[1], cols[6])) | 
|  | 109 | +                o.write("ID={}.fivePrimeUTR{};Parent={}\n".format(transID, idx, transID)) | 
|  | 110 | +            for idx, UTR in enumerate(t[4]): | 
|  | 111 | +                o.write("{}\t{}\t{}\t{}\t{}\t.\t{}\t.\t".format(cols[0], cols[1], threePrime, UTR[0], UTR[1], cols[6])) | 
|  | 112 | +                o.write("ID={}.threePrimeUTR{};Parent={}\n".format(transID, idx, transID)) | 
|  | 113 | +    o.close() | 
|  | 114 | + | 
|  | 115 | + | 
|  | 116 | +parser = argparse.ArgumentParser(description="Parse a GFF3 file lacking UTR entries and add them in. Note that this program assumes a reasonably formatted GFF3 file") | 
|  | 117 | +parser.add_argument("--fiveUTRname", default="five_prime_UTR", help="The label for 5' UTR entries (default: %(default)s)") | 
|  | 118 | +parser.add_argument("--threeUTRname", default="three_prime_UTR", help="The label for 3' UTR entries (default: %(default)s)") | 
|  | 119 | +parser.add_argument("--topLevelID", default="gene", help="The 'type' designating the top-level entry (default: %(default)s)") | 
|  | 120 | +parser.add_argument("--secondLevelID", default="mRNA", help="The 'type' designating the second-level entry, typically something like 'mRNA' or 'transcript' (default: %(default)s)") | 
|  | 121 | +parser.add_argument("--exonID", default="exon", help="The 'type' designating exons (default: %(default)s)") | 
|  | 122 | +parser.add_argument("--CDSID", default="CDS", help="The 'type' designating CDS (default: %(default)s)") | 
|  | 123 | +parser.add_argument("input", metavar="input.gff3", help="Input file name") | 
|  | 124 | +parser.add_argument("output", metavar="output.gff3", help="Output file name") | 
|  | 125 | +args = parser.parse_args() | 
|  | 126 | + | 
|  | 127 | +genes, transcripts, g2t = parseGFF3(args.input, topLevel=args.topLevelID, secondLevel=args.secondLevelID, exon=args.exonID, CDS=args.CDSID) | 
|  | 128 | + | 
|  | 129 | +# Add the UTRs | 
|  | 130 | +findUTRs(transcripts) | 
|  | 131 | +findUTRs(transcripts, fivePrime=False) | 
|  | 132 | + | 
|  | 133 | +# Write the output | 
|  | 134 | +saveGFF(args.output, genes, transcripts, g2t, args.fiveUTRname, args.threeUTRname) | 
0 commit comments