# HG changeset patch # User rlegendre # Date 1428593748 14400 # Node ID 7c944fd9907edb31672a698d6a5c43bd4811d887 # Parent ee3a3435ce43358b0a5c5a18cf67368ab7d3e472 release 2 diff -r ee3a3435ce43 -r 7c944fd9907e 313b8f7d2a92 --- a/313b8f7d2a92 Fri Jan 23 03:31:37 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,395 +0,0 @@ - - - - - - - - -ribo_tools: 313b8f7d2a92 - - -
- - -
- - -

changeset 11:313b8f7d2a92 tip

- - - -
(none)
- - - - - - - - - - - - - - - - - - - - - - - - - -
authorrlegendre
dateThu, 22 Jan 2015 14:34:53 +0100
parents707807fee542
children
filesget_codon_frequency.xml kmer_analysis.xml metagene_frameshift_analysis.xml metagene_readthrough.xml
diffstat - 4 files changed, 56 insertions(+), 55 deletions(-) - - [+] - -
- -
-
line wrap: on
-
line diff
-
-
---- a/get_codon_frequency.xml	Thu Jan 22 14:34:38 2015 +0100
-+++ b/get_codon_frequency.xml	Thu Jan 22 14:34:53 2015 +0100
-@@ -1,5 +1,5 @@
- <tool id="Codon_analysis" name="Codon_density">
--	<description> Analyse Ribo-seq alignments between two conditions to extract codon occupancy </description>
-+	<description>To compare Ribo-seq alignments between two sets of conditions, to determine codon occupancy </description>
- 	<requirements>
- 	    <requirement type="package">samtools</requirement>
- 	    <requirement type="python-module">matplotlib</requirement>
-@@ -32,7 +32,7 @@
- 	</command>
- 
- 	<inputs>
--		<param name="annot" type="data" label="References Input Annotation File (gff)" format="gff" />
-+		<param name="annot" type="data" label="Reference annotation file (GFF)" format="gff" />
- 		<conditional name="replicat_opt">
- 			<param name="rep" type="select" label="Replicate option">
- 				<option value="yes">Yes (only two replicates by condition)</option>
-@@ -40,23 +40,23 @@
- 			</param>
- 			<when value="yes">
- 			## Use conditional balise : if rep =yes : 4 files, else 2 files 
--				<param name="file1" type="data" label="First replicate of first condition (bam file)" format="bam" />
--				<param name="file11" type="data" label="Second replicate of first condition (bam file)" format="bam" />
--				<param name="file2" type="data" label="First replicate of second condition (bam file)" format="bam" />
--				<param name="file22" type="data" label="First replicate of second condition (bam file)" format="bam" />
-+				<param name="file1" type="data" label="First replicate of first condition (Bam file)" format="bam" />
-+				<param name="file11" type="data" label="Second replicate of first condition (Bam file)" format="bam" />
-+				<param name="file2" type="data" label="First replicate of second condition (Bam file)" format="bam" />
-+				<param name="file22" type="data" label="First replicate of second condition (Bam file)" format="bam" />
- 			</when>
- 			<when value="no">
--				<param name="file1" type="data" label="First bam file" format="bam" />
--				<param name="file2" type="data" label="Second bam File" format="bam" />
-+				<param name="file1" type="data" label="First Bam file" format="bam" />
-+				<param name="file2" type="data" label="Second Bam File" format="bam" />
- 			</when>
- 		</conditional>
--		<param name="site" type="select" label="Please choose a ribosome site for codon analysis">
-+		<param name="site" type="select" label="Please choose a ribosomal site for codon analysis">
- 			<option value="A">A</option>
- 			<option value="P">P</option>
- 			<option value="E">E</option>
- 		</param>
- 		<param name="asite" type="integer" label="Off-set from the 5'end of the footprint to the A-site" value ="15"  />
--		<param name="kmer" type="integer" label="Size of the best phasing reads" value ="28"  />
-+		<param name="kmer" type="integer" label="Lenght of the best phasing footprints" value ="28"  />
- 		<param name="cond1" type="text" size="25" label="Condition one" help="Required even if no replicate" />
- 		<param name="cond2" type="text" size="25" label="Condition two" help="Required even if no replicate" />
- 		<param name="color1" type="text" size="50" label="Color for first condition" value ="SkyBlue" help="Enter standard name, hex color string, or rgb code. You cand find all colors here : http://pythonhosted.org/ete2/reference/reference_svgcolors.html" />
-@@ -74,14 +74,12 @@
- 
- Summary
- -------   
--This tool uses Ribo-seq (bam file) to compare codon translation between two conditions.
--For each footprint, codons at choosen site are saved and an histogram with all normalized codon numbers is plotted in both conditions. 
--A second histogram groups all codons corresponding to an amino acid.
--A chisquare test is used for testing if distribution of used codons is the same in both conditions.
-+This tool uses Ribo-seq (BAM file) to determine whether codon occupancy differs between two sets of conditions. For each footprint, the codons at the chosen site are recorded and a histogram displaying all the normalised codon numbers is plotted for both sets of conditions. A second histogram groups together all the codons corresponding to a given amino acid. A chi-squared test is then carried out to determine whether the distribution of the codons used is the same in both sets of conditions.
-+
- 
- Output 
- ------- 
--This tool provides an html output with graphical outputs and a statistical test result. An additionnal csv file with codon numbers is provided. 
-+This tool provides an html file containing graphs and a statistical test result. An additional csv file with codon numbers is provided. 
- 
- 
- Dependances
---- a/kmer_analysis.xml	Thu Jan 22 14:34:38 2015 +0100
-+++ b/kmer_analysis.xml	Thu Jan 22 14:34:53 2015 +0100
-@@ -1,5 +1,5 @@
- <tool id="kmer_analysis" name="Kmer">
--	<description>Compute proportion of each kmer and phasing</description>
-+	<description>To calculate the proportion and phasing of each kmer</description>
- 	<requirements>
- 		<requirement type="package">samtools</requirement>
- 	    <requirement type="package">numpy</requirement>
-@@ -12,7 +12,7 @@
- 	</command>
- 
- 	<inputs>
--		<param name="gff" type="data" label="References Input Annotation File (gff)" format="gff" />
-+		<param name="gff" type="data" label="Reference annotation file (GFF))" format="gff" />
- 		<param name="bamfile" type="data" label="Bam file" format="bam" />
- 	</inputs>
- 
-@@ -25,11 +25,11 @@
- 	
- Summary
- -------
--This tool uses Ribo-seq data (bam file) to compute proportion of each kmer (lenght of footprints) and phasing. 
-+The kmer tool computes the distribution of footprints length from Bam file and determines the proportion of footprints beginning in each frame, for all annotated genes in the GFF file.
- 		
- Output 
- ------- 
--This tool provides an html report with all kmer proportion and phasing. 
-+This tool provides an html report detailing the proportions and phasing of the kmers. 
- 
- 
- Dependances
---- a/metagene_frameshift_analysis.xml	Thu Jan 22 14:34:38 2015 +0100
-+++ b/metagene_frameshift_analysis.xml	Thu Jan 22 14:34:53 2015 +0100
-@@ -1,5 +1,5 @@
- <tool id="frameshift_analysis" name="Frame">
--	<description> Analyse Ribo-seq alignment to extract translational ambiguities events</description>
-+	<description>To analyse Ribo-seq alignments for the extraction of translational ambiguities</description>
- 	<requirements>
- 	    <requirement type="package">samtools</requirement>
- 	    <requirement type="python-module">matplotlib</requirement>
-@@ -8,16 +8,18 @@
- 	    <requirement type="python-module">Bio</requirement>
- 	</requirements>
- 	<command interpreter="python"> 
--		metagene_frameshift_analysis.py --input $reference --bam $mapping --cutoff $cutoff --kmer $kmer --fasta $fasta --dirout $output,$output.files_path --box $boxplot> $log
-+		metagene_frameshift_analysis.py --gff $reference --bam $mapping --cutoff $cutoff --kmer $kmer --fasta $fasta --dirout $output,$output.files_path --box $boxplot --orf_length $orf --frame $frame > $log
- 
- 	</command>
- 
- 	<inputs>
--		<param name="reference" type="data" label="References Input Annotation File (gff)" format="gff" />
--		<param name="mapping" type="data" label="Bam Input File" format="bam" />
--		<param name="fasta" type="data" label="Reference in fasta format" format="fasta" />
--		<param name="kmer" type="integer" label="Longer of the best phasing reads" value ="28"  />
-+		<param name="reference" type="data" label="Reference annotation file (GFF)" format="gff" />
-+		<param name="mapping" type="data" label="Bam file" format="bam" />
-+		<param name="fasta" type="data" label="Reference genome in Fasta format" format="fasta" />
-+		<param name="kmer" type="integer" label="Lenght of the best phasing footprints" value ="28"  />
-+		<param name="frame" type="integer" label="Frame where footprints show best phasing. Must be 1, 2 or 3" value ="1"  />
- 		<param name="cutoff" type="integer" label="Cutoff for frame proportion in coding phase (default = 60 %)" value ="60"  />
-+		<param name="orf" type="integer" label="Approximate size of the segment (in bp)" value ="300"  />		
- 	</inputs>
- 
- 	<outputs>
-@@ -30,35 +32,35 @@
- 	<help>
- Summary
- -------    
--This tool uses Ribo-seq data (bam file) to extract out-of-frame footprints in all genes from a reference annotation file (GFF3). Subprofile are plotted for each gene with dual coding events.
-+This tool uses Ribo-seq data (BAM file) to extract out-of-frame footprints in all genes from a reference annotation file (GFF3). Subprofiles are plotted for each gene with dual coding events.
- 
- 
--*- GFF3 file* : It must contain 9 tabulate-delimited columns : Chromosome, source, feature, start, stop, score, strand, phasing, note. The gene ID was retrieved in note field by "ID=" tag.
-+*- GFF3 file*: This file must have nine tabulated-delimited columns: Chromosome, source, feature, start, stop, score, strand, phasing, note. The gene ID is retrieved from the note field, using the "ID=" tag.
- 
--*- Fasta file* : Reference fasta file. Be careful about the chromosome nomenclature used : it must be compatible with your GFF3 annotation file.
-+*- Fasta file*: Reference fasta file. Care should be taken with the chromosome nomenclature used, which must be compatible with the GFF3 annotation file.
- 
--*- BAM file* : It must be sorted. It can contain either multiples or unaligned footprints
-+*- BAM file*: This file should be sorted and may contain either multiple or unaligned footprints
- 
--*- Kmer* : Lenght of the best phasing footprint. You can compute it running kmer_analysis 
-+*- Kmer*: Length of the best-phased footprints. It can be calculated by running kmer_analysis 
-+
-+*- Frame*: Frame for which the phasing of the footprints is best. It can be calculated by running kmer_analysis 
- 				
--*- Cutoff* : Integer value for selecting all genes that have less than 60 % (default) of footprints in coding frame.
-+*- Cutoff*: An integer value for selecting all genes for which fewer than 60 % (default) of the footprints are in the coding frame.
- 
--
--
--.................................................................................................................................................................................................
-+*- Orf*: Approximate size of the segment.
- 
- 
- 
- Output 
- ------- 
--This tool generates 2 output files :
-+This tool generates 3 output files:
- 
--*- html file* : relative to translational ambiguities detection and visualization.
-+*- html file*: for the detection and visualisation of translational ambiguities.
- 
--*- Stat file* : statistiques about treated footprints and phasing.
-+*- Stat file*: this file provides statistics for the treated footprints and phasing.
- 
--*- Boxplot* : Proportion of footprints in the three frames for all genes.
--
-+*- Boxplot*: Proportion of footprints in the three frames, for all genes.
-+		
- 
- Dependances
- ------------
---- a/metagene_readthrough.xml	Thu Jan 22 14:34:38 2015 +0100
-+++ b/metagene_readthrough.xml	Thu Jan 22 14:34:53 2015 +0100
-@@ -1,5 +1,5 @@
- <tool id="readthrough_analysis" name="Stop_supp">
--	<description> Analyse Ribo-seq alignment to detect readthrough events</description>
-+	<description>To analyse Ribo-seq alignments for the detection of stop codon readthrough events</description>
- 	<requirements>
- 	    <requirement type="package">samtools</requirement>
- 	    <requirement type="python-module">HTseq</requirement>
-@@ -8,14 +8,15 @@
- 	    <requirement type="python-module">Bio</requirement>
- 	</requirements>
- 	<command interpreter="python"> 
--		metagene_readthrough.py --gff $gff --fasta $fasta --bam $mapping --dirout=$output,$output.files_path
-+		metagene_readthrough.py --gff $gff --fasta $fasta --bam $mapping --dirout=$output,$output.files_path --extend $ext
- 
- 	</command>
- 
- 	<inputs>
--		<param name="gff" type="data" label="References Input Annotation File (gff)" format="gff" />
--		<param name="fasta" type="data" label="Reference in fasta format" format="fasta" />
--		<param name="mapping" type="data" label="Bam Input File" format="bam" />
-+		<param name="gff" type="data" label="Reference annotation file (GFF)" format="gff" />
-+		<param name="fasta" type="data" label="Reference genome in Fasta format" format="fasta" />
-+		<param name="mapping" type="data" label="Bam File" format="bam" />
-+		<param name="ext" type="integer" label="Length of 3’ UTR extension downstream the annotated stop codon (in bp)" value="300" />
- 	</inputs>
-             
- 	<outputs>
-@@ -25,28 +26,28 @@
- 	<help>
- Summary
- -------          
--This tool uses Ribo-seq data (bam file) to extract potential genes with readthrough events from a reference annotation file (GFF3).
-+This tool uses Ribo-seq data (BAM file) to extract genes displaying potential stop codon readthrough events from a reference annotation file (GFF3).
- 
--C-terminal protein extensions were identified as previously described (Dunn J.G. and al, 2013). Only uniquely mapped footprints whose size is in the range 25 to 34 are considered.
--A gene is read-though if :
-+C-terminal protein extensions were identified as previously described (Dunn J.G. et al, 2013). Only uniquely mapped footprints with a size in the 25 to 34 range are considered.
-+A gene is considered to display readthrough if:
- 
-  i) It is covered by more than 128 footprints.
- 
-- ii) There are footprints after stop codon.
-+ ii) There are footprints after the stop codon.
- 
-- iii) There are footprints overlapping the next in frame stop codon.
-+ iii) There are footprints overlapping the next in-frame stop codon.
- 
-- iv) There is not Methionine in the next five codons downstream the official stop codon of CDS.
-+ iv) There is no methionine codon in the next five codons downstream from the official stop codon of the CDS.
- 
-- v) The coverage is homogeneous within the extension.
-+ v) Coverage is homogeneous within the extension.
- 
--Stop codon readthrough was estimated by calculating a ratio between footprints in the C-terminal extension and in the CDS. Ribosome density footprints were estimated in RPKM (reads per kilobase per million). 
--To control variability due to stop codon peaks, footprints mapping to stop codons are excluded to RPKM computing.
-+Stop codon readthrough was estimated by calculating the ratio of the number of footprints in the C-terminal extension to that in the CDS. Ribosome density footprints were estimated in RPKM (reads per kilobase per million). We controlled for variability due to stop codon peaks, by excluding footprints mapping to stop codons from the calculation of RPKM.
- 
-+		
- Output 
- ------- 
--This tool produces html file with plots for each readthrough gene.
--
-+This tool produces an html file with plots for each gene displaying readthrough.
-+				
- 
- Dependances
- ------------
-
-
- -
-
- - - - - - diff -r ee3a3435ce43 -r 7c944fd9907e get_codon_frequency.py --- a/get_codon_frequency.py Fri Jan 23 03:31:37 2015 -0500 +++ b/get_codon_frequency.py Thu Apr 09 11:35:48 2015 -0400 @@ -28,7 +28,7 @@ import ribo_functions import HTSeq # #libraries for debugg -import pdb +#import pdb # import cPickle def stop_err(msg): @@ -36,70 +36,6 @@ sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) sys.exit() -def store_gff(gff): - ''' - parse and store gff file in a dictionnary - ''' - try: - GFF = OrderedDict({}) - with open(gff, 'r') as f_gff : - # GFF['order'] = [] - for line in f_gff: - # # switch commented lines - head = line.split("#")[0] - if head != "" : - feature = (line.split('\t')[8]).split(';') - chrom = line.split('\t')[0] - if chrom not in GFF : - GFF[chrom] = {} - # first line is already gene line : - if line.split('\t')[2] == 'gene' : - gene = feature[0].replace("ID=", "") - if re.search('gene', feature[2]) : - Name = feature[2].replace("gene=", "") - else : - Name = "Unknown" - # #get annotation - note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) - note = urllib.unquote(str(note)).replace("\n", "") - # # store gene information - # GFF['order'].append(gene) - GFF[chrom][gene] = {} - GFF[chrom][gene]['chrom'] = line.split('\t')[0] - GFF[chrom][gene]['start'] = int(line.split('\t')[3]) - GFF[chrom][gene]['stop'] = int(line.split('\t')[4]) - GFF[chrom][gene]['strand'] = line.split('\t')[6] - GFF[chrom][gene]['name'] = Name - GFF[chrom][gene]['note'] = note - GFF[chrom][gene]['exon'] = {} - GFF[chrom][gene]['exon_number'] = 0 - # print Name - elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0]) - if GFF[chrom].has_key(gene) : - GFF[chrom][gene]['exon_number'] += 1 - exon_number = GFF[chrom][gene]['exon_number'] - GFF[chrom][gene]['exon'][exon_number] = {} - GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7] - GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) - GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) - # # if there is a five prim UTR intron, we change start of gene - elif line.split('\t')[2] == 'five_prime_UTR_intron' : - if GFF[chrom][gene]['strand'] == "+" : - GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start'] - else : - GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop'] - return GFF - except Exception, e: - stop_err('Error during gff storage : ' + str(e)) - -#chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689 -#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra -#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr -#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified -#chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified -#chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified - def init_codon_dict(): @@ -115,13 +51,19 @@ ''' try: codon = init_codon_dict() + multi_tag = "XS:i:" ## bowtie Tag + tag = "IH:i:1" ## RUM tag + for feature in GFF : if feature.type == 'gene' : codon_dict = init_codon_dict() chrom = feature.iv.chrom start = feature.iv.start stop = feature.iv.end - region = chrom + ':' + str(start) + '-' + str(stop) + if start+50 < stop-50 : + region = chrom + ':' + str(start+50) + '-' + str(stop-50) + else : + break ## DEPRECATED #for chrom in GFF.iterkeys(): @@ -143,13 +85,16 @@ multi_tag = "XS:i:" elif 'bwa' in head: multi_tag = "XT:A:R" + elif 'TopHat' in head: + tag = "NH:i:1" else : - stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") + if len(read) == 0: continue len_read = len(read.split('\t')[9]) # if it's read of good length - if len_read == kmer and multi_tag not in read: + if len_read == kmer and (tag in line or multi_tag not in line): feat = read.split('\t') seq = feat[9] # if it's a reverse read @@ -179,6 +124,7 @@ cod = seq[a_pos-6:a_pos-3] if codon_dict.has_key(cod) : codon_dict[cod] += 1 + del(read) # # add in global dict for cod, count in codon_dict.iteritems() : codon[cod] += count @@ -400,7 +346,7 @@ cond2_aa.append(z[1]) max_valaa.append(max(z)) # # plot amino acid profile : - fig = pl.figure(figsize=(24, 10), num=1) + fig = pl.figure(figsize=(30, 10), num=1) width = .50 ax = fig.add_subplot(111) ax.xaxis.set_ticks([]) @@ -440,8 +386,8 @@ # plot result - fig = pl.figure(figsize=(24, 10), num=1) - width = .50 + fig = pl.figure(figsize=(30, 10), num=1) + width = .40 ind = arange(len(codon_sorted)) ax = fig.add_subplot(111) pl.xlim(0, len(codon_sorted) + 1) @@ -546,7 +492,7 @@ max_val.append(max(cond1_norm[i], cond2_norm[i])) # plot result - fig = pl.figure(figsize=(30, 10), num=1) + fig = pl.figure(figsize=(40, 10), num=1) #fig = pl.figure(num=1) width = .40 ind = arange(len(codon_sorted)) @@ -560,7 +506,7 @@ ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1) ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2) for x, y, z in zip(ind, max_val, codon_sorted): - ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8) + ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8) ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)') ax.set_xlabel('Codons') handles, labels = ax.get_legend_handles_labels() @@ -657,7 +603,7 @@ parser.add_option("-C", "--cond2", dest="c2", type="string", help="Name of second condition", metavar="STR") - parser.add_option("-k", "--kmer", dest="kmer", type="int", + parser.add_option("-k", "--kmer", dest="kmer", type="int", default = 28 , help="Length of your phasing reads", metavar="INT") # parser.add_option("-l", "--list", dest="list_cod", type= "string", @@ -669,19 +615,19 @@ parser.add_option("-d", "--dirout", dest="dirout", type="string", help="write report to PNG files", metavar="FILE") - parser.add_option("-a", "--asite", dest="asite", type="int", - help="Off-set from the 5'end of the footprint to the A-site", metavar="INT") + parser.add_option("-a", "--asite", dest="asite", type="int", default = 15 , + help="Off-set from the 5'end of the footprint to the A-site (default is 15)", metavar="INT") - parser.add_option("-s", "--site", dest="site", type="string", - help="Script can compute in site A, P or E", metavar="A|P|E") + parser.add_option("-s", "--site", dest="site", type="string", default = "A" , + help="Script can compute in site A, P or E (default is A-site)", metavar="A|P|E") - parser.add_option("-r", "--rep", dest="rep", type="string", + parser.add_option("-r", "--rep", dest="rep", type="string", default = "no" , help="if replicate or not", metavar="yes|no") - parser.add_option("-x", "--hex_col1", dest="color1", type= "string", + parser.add_option("-x", "--hex_col1", dest="color1", type= "string", default = "SkyBlue" , help="Color for first condition", metavar="STR") - parser.add_option("-X", "--hex_col2", dest="color2", type= "string", + parser.add_option("-X", "--hex_col2", dest="color2", type= "string", default = "Plum" , help="Color for second condition", metavar="STR") parser.add_option("-q", "--quiet", @@ -702,28 +648,6 @@ if not colors.is_color_like(options.color2) : stop_err( options.color2+' is not a proper color' ) - ## identify GFF or GTF format from 9th column - #with open (options.gff,"r") as gffile : - # for line in gffile : - # if '#' in line : - # ## skip header - # gffile.next() - # elif 'gene_id' in line : - # ## launch gtf reader : - # GFF = ribo_functions.store_gtf(options.gff) - # break - # elif 'ID=' in line : - # ## launch gff reader - # GFF = ribo_functions.store_gff(options.gff) - # break - # else : - # stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) - - #GFF = store_gff(options.gff) - #GFF = ribo_functions.store_gtf(options.gff) - ## check gff reading - #if not GFF['order'] : - # stop_err( 'Incorrect GFF file' + str( e ) ) #### NOT USE IN FINAL VERSION # # get codon list diff -r ee3a3435ce43 -r 7c944fd9907e get_codon_frequency.xml --- a/get_codon_frequency.xml Fri Jan 23 03:31:37 2015 -0500 +++ b/get_codon_frequency.xml Thu Apr 09 11:35:48 2015 -0400 @@ -1,5 +1,5 @@ - To compare Ribo-seq alignments between two sets of conditions, to determine codon occupancy + To compare Ribo-seq alignments between two sets of conditions, to determine codon occupancy samtools matplotlib @@ -57,8 +57,8 @@ - - + + diff -r ee3a3435ce43 -r 7c944fd9907e kmer_analysis.py --- a/kmer_analysis.py Fri Jan 23 03:31:37 2015 -0500 +++ b/kmer_analysis.py Thu Apr 09 11:35:48 2015 -0400 @@ -80,6 +80,10 @@ ''' global total_mapped_read + ## tags by default + multi_tag = "XS:i:" + tag = "IH:i:1" + ## init kmer dict KMER = OrderedDict({}) @@ -103,10 +107,10 @@ multi_tag = "XS:i:" elif 'bwa' in line: multi_tag = "XT:A:R" - #elif 'TopHat' in line: - # multi_tag = "NH:i:1" + elif 'TopHat' in line: + tag = "NH:i:1" else : - stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") # get footprint elif re.search('^[^@].+', line) : @@ -121,7 +125,7 @@ read_pos = int(line.split('\t')[3]) read_sens = int(line.split('\t')[1]) #len_read = len(line.split('\t')[9]) - if len_read == kmer and multi_tag not in line: + if len_read == kmer and (tag in line or multi_tag not in line): ###if line.split('\t')[5] == '28M' : total_mapped_read +=1 #if it's a forward read @@ -399,7 +403,7 @@ #GFF = ribo_functions.store_gtf(options.gff) ## check gff reading if not GFF['order'] : - stop_err( 'Incorrect GFF file' + str( e ) ) + stop_err( 'Incorrect GFF file' ) ## split bam split_bam(options.bamfile,tmpdir) @@ -416,12 +420,12 @@ ## compute analysis with other kmer for keys in kmer.iterkeys() : if keys != 28 : - ## remove all txt files in tmp directory - if os.system("rm "+tmpdir+"/*.txt") != 0 : - stop_err( 'Error during tmp directory cleaning : ' + str( e ) ) - + ## If not enought reads in this kmer : if kmer[keys] > 100 : + ## remove all txt files in tmp directory + if os.system("rm "+tmpdir+"/*.txt") != 0 : + stop_err( 'Error during tmp directory cleaning') ## compute coverage and distribution kmer tmp = get_first_base(tmpdir, keys) ## compute phasing diff -r ee3a3435ce43 -r 7c944fd9907e kmer_analysis.xml --- a/kmer_analysis.xml Fri Jan 23 03:31:37 2015 -0500 +++ b/kmer_analysis.xml Thu Apr 09 11:35:48 2015 -0400 @@ -17,7 +17,7 @@ - + @@ -25,8 +25,11 @@ Summary ------- -The kmer tool computes the distribution of footprints length from Bam file and determines the proportion of footprints beginning in each frame, for all annotated genes in the GFF file. +The kmer tool computes the distribution of footprints length from Bam file and determines the proportion of footprints beginning in each frame, for all annotated genes in the `GFF3`_ file. + +.. _GFF3: http://gmod.org/wiki/GFF3 + Output ------- This tool provides an html report detailing the proportions and phasing of the kmers. diff -r ee3a3435ce43 -r 7c944fd9907e metagene_frameshift_analysis.py --- a/metagene_frameshift_analysis.py Fri Jan 23 03:31:37 2015 -0500 +++ b/metagene_frameshift_analysis.py Thu Apr 09 11:35:48 2015 -0400 @@ -11,10 +11,10 @@ import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time, csv import math from numpy import arange -from matplotlib import pyplot as pl -#import matplotlib -#matplotlib.use('Agg') -#import matplotlib.pyplot as pl +#from matplotlib import pyplot as pl +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as pl import itertools from Bio import SeqIO from Bio.Seq import Seq @@ -87,6 +87,10 @@ write footprint coverage file for each sam file in tmpdir ''' global total_mapped_read + ## tags by default + multi_tag = "XS:i:" + tag = "IH:i:1" + try: ### compute position of P-site according to frame (P-site -> +15 starting from 5' end of footprint) p_site_pos = 16-frame @@ -105,13 +109,15 @@ genomeF = [0]*size genomeR = [0]*size # define multiple reads keys from mapper - elif '@PG' in line : + elif '@PG\tID' in line : if 'bowtie' in line: multi_tag = "XS:i:" elif 'bwa' in line: multi_tag = "XT:A:R" + elif 'TopHat' in line: + tag = "NH:i:1" else : - stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") # get footprint elif re.search('^[^@].+', line) : @@ -120,7 +126,7 @@ read_sens = int(line.split('\t')[1]) len_read = len(line.split('\t')[9]) read_qual = int(line.split('\t')[4]) - if len_read == kmer and multi_tag not in line and read_qual > 20 : + if len_read == kmer and (tag in line or multi_tag not in line) and read_qual > 20 : ###if line.split('\t')[5] == '28M' : # print line.rstrip() total_mapped_read +=1 @@ -315,7 +321,7 @@ pl.ylabel('Total number of footprints (percent)') pl.draw() pl.savefig(subfolder+'/boxplot.png',format='png', dpi=640) - #pl.savefig(box_file, format='png', dpi=640) + pl.savefig(box_file, format='png', dpi=640) return GFF @@ -760,16 +766,16 @@ parser.add_option("-x", "--boxplot", dest="box", type= "string", help="report boxplot in this file", metavar="STR") - parser.add_option("-c", "--cutoff", dest="cutoff", type= "int", + parser.add_option("-c", "--cutoff", dest="cutoff", type= "int", default = 60 , help="Integer value for selecting all genes that have less than 60 % (default) of reads in coding frame ", metavar="INT") - parser.add_option("-k", "--kmer", dest="kmer", type= "int", - help="Longer of your phasing reads", metavar="INT") + parser.add_option("-k", "--kmer", dest="kmer", type= "int",default = 28 , + help="Longer of your phasing reads (Default is 28)", metavar="INT") - parser.add_option("-o", "--orf_length", dest="orf_length", type= "int", - help="Mean of ORF's length for segmentation", metavar="INT") + parser.add_option("-o", "--orf_length", dest="orf_length", type= "int", default = 300 , + help="Mean of ORF's length for segmentation (Default is 300pb, recommended for yeast)", metavar="INT") - parser.add_option("-p", "--frame", dest="frame", type="int", + parser.add_option("-p", "--frame", dest="frame", type="int", default = 1 , help="Script can compute in frame 1, 2 or 3", metavar="1|2|3") parser.add_option("-q", "--quiet", @@ -821,10 +827,10 @@ break else : stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) - #GFF = store_gff(options.gfffile) + ## check gff reading if not GFF['order'] : - stop_err( 'Incorrect GFF file' + str( e ) ) + stop_err( 'Incorrect GFF file') # #### cPickles for Test #### #if os.path.isfile("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict") : diff -r ee3a3435ce43 -r 7c944fd9907e metagene_frameshift_analysis.xml --- a/metagene_frameshift_analysis.xml Fri Jan 23 03:31:37 2015 -0500 +++ b/metagene_frameshift_analysis.xml Thu Apr 09 11:35:48 2015 -0400 @@ -32,7 +32,7 @@ Summary ------- -This tool uses Ribo-seq data (BAM file) to extract out-of-frame footprints in all genes from a reference annotation file (GFF3). Subprofiles are plotted for each gene with dual coding events. +This tool uses Ribo-seq data (BAM file) to extract out-of-frame footprints in all genes from a reference annotation file (`GFF3`_). Subprofiles are plotted for each gene with dual coding events. *- GFF3 file*: This file must have nine tabulated-delimited columns: Chromosome, source, feature, start, stop, score, strand, phasing, note. The gene ID is retrieved from the note field, using the "ID=" tag. @@ -49,8 +49,10 @@ *- Orf*: Approximate size of the segment. + +.. _GFF3: http://gmod.org/wiki/GFF3 - + Output ------- This tool generates 3 output files: diff -r ee3a3435ce43 -r 7c944fd9907e metagene_readthrough.py --- a/metagene_readthrough.py Fri Jan 23 03:31:37 2015 -0500 +++ b/metagene_readthrough.py Thu Apr 09 11:35:48 2015 -0400 @@ -665,44 +665,6 @@ stop_err( 'Error during computing analysis : ' + str( e ) ) -def cleaning_bam(bam): - ''' - Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 - ''' - try : - header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) - #header = results.split('\n') - - # check mapper for define multiple tag - if re.search('bowtie', header): - multi_tag = "XS:i:" - elif re.search('bwa', header): - multi_tag = "XT:A:M" - else : - stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping") - - tmp_sam = tempfile.mktemp() - cmd = "samtools view %s > %s" % (bam, tmp_sam) - proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) - returncode = proc.wait() - - - with open(tempfile.mktemp(),'w') as out : - out.write(header) - with open(tmp_sam,'r') as sam : - for line in sam : - if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : - if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : - out.write(line) - bamfile = tempfile.mktemp()+'.bam' - cmd = "samtools view -hSb %s > %s" % (out.name,bamfile) - proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) - returncode = proc.wait() - - return bamfile - - except Exception,e: - stop_err( 'Error during cleaning bam : ' + str( e ) ) def write_html_page(html,subfolder) : @@ -1004,8 +966,8 @@ #GFF = ribo_functions.store_gtf(options.gff) ## check gff reading if not GFF['order'] : - stop_err( 'Incorrect GFF file' + str( e ) ) - clean_file = cleaning_bam(options.bamfile) + stop_err( 'Incorrect GFF file' ) + clean_file = ribo_functions.cleaning_bam(options.bamfile) compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend) if os.path.exists( clean_file ): os.remove( clean_file ) diff -r ee3a3435ce43 -r 7c944fd9907e metagene_readthrough.xml --- a/metagene_readthrough.xml Fri Jan 23 03:31:37 2015 -0500 +++ b/metagene_readthrough.xml Thu Apr 09 11:35:48 2015 -0400 @@ -26,7 +26,7 @@ Summary ------- -This tool uses Ribo-seq data (BAM file) to extract genes displaying potential stop codon readthrough events from a reference annotation file (GFF3). +This tool uses Ribo-seq data (BAM file) to extract genes displaying potential stop codon readthrough events from a reference annotation file (`GFF3`_). C-terminal protein extensions were identified as previously described (Dunn J.G. et al, 2013). Only uniquely mapped footprints with a size in the 25 to 34 range are considered. A gene is considered to display readthrough if: @@ -43,6 +43,8 @@ Stop codon readthrough was estimated by calculating the ratio of the number of footprints in the C-terminal extension to that in the CDS. Ribosome density footprints were estimated in RPKM (reads per kilobase per million). We controlled for variability due to stop codon peaks, by excluding footprints mapping to stop codons from the calculation of RPKM. +.. _GFF3: http://gmod.org/wiki/GFF3 + Output ------- diff -r ee3a3435ce43 -r 7c944fd9907e ribo_functions.py --- a/ribo_functions.py Fri Jan 23 03:31:37 2015 -0500 +++ b/ribo_functions.py Thu Apr 09 11:35:48 2015 -0400 @@ -7,7 +7,7 @@ @license: GPL v3 ''' -import sys, subprocess, re, commands, time, urllib +import sys, subprocess, re, commands, time, urllib, tempfile from copy import copy @@ -127,7 +127,9 @@ try: GFF = {} with open(gff, 'r') as f_gff : + GFF['order'] = [] + #for line in itertools.islice( f_gff, 25 ): for line in f_gff: ## switch commented lines line = line.split("#")[0] @@ -136,13 +138,21 @@ # first line is already gene line : if line.split('\t')[2] == 'gene' : gene = feature[0].replace("ID=","") - if re.search('gene',feature[2]) : - Name = feature[2].replace("gene=","") + if 'Name' in line : + regex = re.compile('(Name=)([^;]*);') + res = regex.search(line.split('\t')[8]) + Name = res.group(2) + Name = Name.rstrip() else : Name = "Unknown" ##get annotation - note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) - note = urllib.unquote(str(note)).replace("\n","") + if 'Note' in line : + regex = re.compile('(Note=)([^;]*);') + res = regex.search(line.split('\t')[8]) + note = res.group(2) + note = urllib.unquote(str(note)).replace("\n","") + else : + note = "" ## store gene information GFF['order'].append(gene) GFF[gene] = {} @@ -156,7 +166,11 @@ GFF[gene]['exon_number'] = 0 #print Name elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0]) + regex = re.compile('(Parent=)([^;]*);') + res = regex.search(line.split('\t')[8]) + gene = res.group(2) + if 'mRNA' in gene: + gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) if GFF.has_key(gene) : GFF[gene]['exon_number'] += 1 exon_number = GFF[gene]['exon_number'] @@ -166,7 +180,7 @@ GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) ## if there is a five prim UTR intron, we change start of gene - elif line.split('\t')[2] == 'five_prime_UTR_intron' : + elif line.split('\t')[2] == 'five_prime_UTR_intron' : if GFF[gene]['strand'] == "+" : GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] else : @@ -199,7 +213,7 @@ # first line is already gene line : if 'protein_coding' in line : ##get name - gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() + gene = re.sub(r".+transcript_id \"([\w|-|\.]+)\";.*", r"\1", line).rstrip() Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() if line.split('\t')[2] == 'CDS' : ##if its first time we get this gene @@ -245,7 +259,7 @@ return __reverse_coordinates__(GFF) except Exception,e: - stop_err( 'Error during gff storage : ' + str( e ) ) + stop_err( 'Error during gtf storage : ' + str( e ) ) ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; @@ -288,15 +302,19 @@ try : header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) #header = results.split('\n') - + ## tags by default + multi_tag = "XS:i:" + tag = "IH:i:1" # check mapper for define multiple tag - if re.search('bowtie', header): + if 'bowtie' in header: multi_tag = "XS:i:" - elif re.search('bwa', header): + elif 'bwa' in header: multi_tag = "XT:A:R" + elif 'TopHat' in header: + tag = "NH:i:1" else : - stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping") - + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") + tmp_sam = tempfile.mktemp() cmd = "samtools view %s > %s" % (bam, tmp_sam) proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) @@ -307,7 +325,7 @@ out.write(header) with open(tmp_sam,'r') as sam : for line in sam : - if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : + if (multi_tag not in line or tag in line) and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : out.write(line) bamfile = tempfile.mktemp()+'.bam'