changeset 3:3cd457890ae0 draft

planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/galaxy_sequence_utils/fastq_stats commit f2582539542b33240234e8ea6093e25d0aee9b6a
author devteam
date Sat, 30 Sep 2017 13:53:27 -0400
parents 6d40ee910052
children baee4587f013
files fastq_stats.py fastq_stats.xml test-data/fastq_stats_1_out.tabular tool_dependencies.xml
diffstat 4 files changed, 67 insertions(+), 124 deletions(-) [+]
line wrap: on
line diff
--- a/fastq_stats.py	Fri Dec 18 19:30:26 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,48 +0,0 @@
-#Dan Blankenberg
-import sys
-from galaxy_utils.sequence.fastq import fastqReader, fastqAggregator
-
-VALID_NUCLEOTIDES = [ 'A', 'C', 'G', 'T', 'N' ]
-VALID_COLOR_SPACE = map( str, range( 7 ) ) + [ '.' ]
-SUMMARY_STAT_ORDER = ['read_count', 'min_score', 'max_score', 'sum_score', 'mean_score', 'q1', 'med_score', 'q3', 'iqr', 'left_whisker', 'right_whisker' ]
-
-def main():
-    input_filename = sys.argv[1]
-    output_filename = sys.argv[2]
-    input_type = sys.argv[3] or 'sanger'
-    
-    aggregator = fastqAggregator()
-    num_reads = None
-    fastq_read = None
-    for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
-        aggregator.consume_read( fastq_read )
-    out = open( output_filename, 'wb' )
-    valid_nucleotides = VALID_NUCLEOTIDES
-    if fastq_read:
-        if fastq_read.sequence_space == 'base':
-            out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n' )
-        else:
-            out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n' )
-            valid_nucleotides = VALID_COLOR_SPACE
-    for i in range( aggregator.get_max_read_length() ):
-        column_stats = aggregator.get_summary_statistics_for_column( i )
-        out.write( '%i\t' % ( i + 1 ) )
-        out.write( '%s\t' * len( SUMMARY_STAT_ORDER ) % tuple( [ column_stats[ key ] for key in SUMMARY_STAT_ORDER ] ) )
-        out.write( '%s\t' % ','.join( map( str, column_stats['outliers'] ) ) )
-        base_counts = aggregator.get_base_counts_for_column( i )
-        for nuc in valid_nucleotides:
-            out.write( "%s\t" % base_counts.get( nuc, 0 ) )
-        extra_nucs = sorted( [ nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides ] )
-        out.write( "%s\t%s\n" % ( ','.join( extra_nucs ), ','.join( str( base_counts[nuc] ) for nuc in extra_nucs ) ) )
-    out.close()
-    if num_reads is None:
-        print "No valid fastq reads could be processed."
-    else:
-        print "%i fastq reads were processed." % ( num_reads + 1 )
-        print "Based upon quality values and sequence characters, the input data is valid for: %s" % ( ", ".join( aggregator.get_valid_formats() ) or "None" )
-        ascii_range = aggregator.get_ascii_range()
-        decimal_range =  aggregator.get_decimal_range()
-        print "Input ASCII range: %s(%i) - %s(%i)" % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) ) #print using repr, since \x00 (null) causes info truncation in galaxy when printed
-        print "Input decimal range: %i - %i" % ( decimal_range[0], decimal_range[1] )
-
-if __name__ == "__main__": main()
--- a/fastq_stats.xml	Fri Dec 18 19:30:26 2015 -0500
+++ b/fastq_stats.xml	Sat Sep 30 13:53:27 2017 -0400
@@ -1,25 +1,27 @@
-<tool id="fastq_stats" name="FASTQ Summary Statistics" version="1.0.0">
-  <description>by column</description>
-  <requirements>
-    <requirement type="package" version="1.0.0">galaxy_sequence_utils</requirement>
-  </requirements>
-  <command interpreter="python">fastq_stats.py '$input_file' '$output_file' '${input_file.extension[len( 'fastq' ):]}'</command>
-  <inputs>
-    <param name="input_file" type="data" format="fastqsanger,fastqillumina,fastqsolexa,fastqcssanger" label="FASTQ File"/>
-  </inputs>
-  <outputs>
-    <data name="output_file" format="tabular" />
-  </outputs>
-  <tests>
-    <test>
-      <param name="input_file" value="fastq_stats1.fastq" ftype="fastqsanger" />
-      <output name="output_file" file="fastq_stats_1_out.tabular" />
-    </test>
-  </tests>
-  <help>
+<tool id="fastq_stats" name="FASTQ Summary Statistics" version="1.1.1">
+    <description>by column</description>
+    <requirements>
+        <requirement type="package" version="1.1.1">galaxy_sequence_utils</requirement>
+    </requirements>
+    <command><![CDATA[
+gx-fastq-stats '$input_file' '$output_file' '${input_file.extension[len('fastq'):]}'
+    ]]></command>
+    <inputs>
+        <param name="input_file" type="data" format="fastqsanger,fastqillumina,fastqsolexa,fastqcssanger,fastqsanger.gz,fastqillumina.gz,fastqsolexa.gz,fastqcssanger.gz,fastqsanger.bz2,fastqillumina.bz2,fastqsolexa.bz2,fastqcssanger.bz2" label="FASTQ File"/>
+    </inputs>
+    <outputs>
+        <data name="output_file" format="tabular" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_file" value="fastq_stats1.fastq" ftype="fastqsanger" />
+            <output name="output_file" file="fastq_stats_1_out.tabular" />
+        </test>
+    </tests>
+    <help><![CDATA[
 **What it does**
 
-This tool creates summary statistics on a FASTQ file. 
+This tool creates summary statistics on a FASTQ file.
 
 .. class:: infomark
 
@@ -53,24 +55,19 @@
 For example::
 
   #column   count   min max sum mean    Q1  med Q3  IQR lW  rW  outliers    A_Count C_Count G_Count T_Count N_Count other_bases other_base_count
-  1   14336356    2   33  450600675   31.4306281875   32.0    33.0    33.0    1.0 31  33  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30    4482314 2199633 4425957 3208745 19707       
-  2   14336356    2   34  441135033   30.7703737965   30.0    33.0    33.0    3.0 26  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25   4419184 2170537 4627987 3118567 81      
-  3   14336356    2   34  433659182   30.2489127642   29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4310988 2941988 3437467 3645784 129     
-  4   14336356    2   34  433635331   30.2472490917   29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4110637 3007028 3671749 3546839 103     
-  5   14336356    2   34  432498583   30.167957813    29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4348275 2935903 3293025 3759029 124     
+  1   14336356    2   33  450600675   31.4306281875   32.0    33.0    33.0    1.0 31  33  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30    4482314 2199633 4425957 3208745 19707
+  2   14336356    2   34  441135033   30.7703737965   30.0    33.0    33.0    3.0 26  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25   4419184 2170537 4627987 3118567 81
+  3   14336356    2   34  433659182   30.2489127642   29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4310988 2941988 3437467 3645784 129
+  4   14336356    2   34  433635331   30.2472490917   29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4110637 3007028 3671749 3546839 103
+  5   14336356    2   34  432498583   30.167957813    29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4348275 2935903 3293025 3759029 124
 
 -----
 
 .. class:: warningmark
 
 Adapter bases in color space reads are excluded from statistics.
-
-------
-
-  </help>
-  
-  <citations>
-    <citation type="doi">10.1093/bioinformatics/btq281</citation>
-  </citations>
-  
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btq281</citation>
+    </citations>
 </tool>
--- a/test-data/fastq_stats_1_out.tabular	Fri Dec 18 19:30:26 2015 -0500
+++ b/test-data/fastq_stats_1_out.tabular	Sat Sep 30 13:53:27 2017 -0400
@@ -1,37 +1,37 @@
 #column	count	min	max	sum	mean	Q1	med	Q3	IQR	lW	rW	outliers	A_Count	C_Count	G_Count	T_Count	N_Count	other_bases	other_base_count
-1	9	23	34	288	32.0	33.0	33.0	33.0	0.0	33	33	23,34	3	1	4	1	0		
-2	9	28	33	287	31.8888888889	30.5	33.0	33.0	2.5	28	33		3	3	2	1	0		
-3	9	13	34	268	29.7777777778	27.5	33.0	33.5	6.0	27	34	13	5	1	0	3	0		
-4	9	17	33	261	29.0	24.5	33.0	33.0	8.5	17	33		1	2	3	3	0		
-5	9	22	33	269	29.8888888889	26.0	33.0	33.0	7.0	22	33		3	3	3	0	0		
-6	9	22	33	277	30.7777777778	29.0	33.0	33.0	4.0	28	33	22	5	3	0	1	0		
-7	9	21	33	258	28.6666666667	23.0	33.0	33.0	10.0	21	33		4	1	3	1	0		
-8	9	12	33	263	29.2222222222	26.5	33.0	33.0	6.5	21	33	12	2	1	1	5	0		
-9	9	29	33	290	32.2222222222	31.5	33.0	33.0	1.5	30	33	29	3	3	2	1	0		
-10	9	23	33	277	30.7777777778	28.0	33.0	33.0	5.0	23	33		1	4	2	2	0		
-11	9	12	33	245	27.2222222222	21.0	31.0	33.0	12.0	12	33		5	2	1	1	0		
-12	9	13	33	214	23.7777777778	14.0	24.0	33.0	19.0	13	33		2	4	2	1	0		
-13	9	5	33	249	27.6666666667	26.5	31.0	33.0	6.5	24	33	5	2	1	1	5	0		
-14	9	5	33	233	25.8888888889	19.5	33.0	33.0	13.5	5	33		3	3	2	1	0		
-15	9	15	33	251	27.8888888889	22.5	33.0	33.0	10.5	15	33		5	1	1	2	0		
-16	9	23	34	269	29.8888888889	23.5	33.0	33.0	9.5	23	34		3	1	2	3	0		
-17	9	13	34	266	29.5555555556	27.0	33.0	33.0	6.0	21	34	13	2	3	1	3	0		
-18	9	21	34	272	30.2222222222	26.0	33.0	33.0	7.0	21	34		0	5	1	3	0		
-19	9	5	34	244	27.1111111111	24.0	30.0	33.0	9.0	21	34	5	4	4	1	0	0		
-20	9	11	34	241	26.7777777778	17.0	32.0	33.0	16.0	11	34		3	4	2	0	0		
-21	9	13	33	240	26.6666666667	22.5	27.0	33.0	10.5	13	33		1	4	0	4	0		
-22	9	5	33	190	21.1111111111	9.0	21.0	33.0	24.0	5	33		1	4	0	3	1		
-23	9	5	33	205	22.7777777778	14.0	26.0	33.0	19.0	5	33		4	4	1	0	0		
-24	9	5	33	247	27.4444444444	24.5	31.0	33.0	8.5	21	33	5	1	5	1	2	0		
-25	9	11	34	241	26.7777777778	18.5	33.0	33.0	14.5	11	34		3	4	0	2	0		
-26	9	5	33	212	23.5555555556	11.5	31.0	33.0	21.5	5	33		0	6	0	3	0		
-27	9	5	33	227	25.2222222222	20.0	26.0	33.0	13.0	5	33		3	4	1	1	0		
-28	9	21	33	255	28.3333333333	22.5	31.0	33.0	10.5	21	33		2	4	3	0	0		
-29	9	5	33	228	25.3333333333	19.5	30.0	33.0	13.5	5	33		2	4	1	2	0		
-30	9	10	33	213	23.6666666667	13.5	28.0	33.0	19.5	10	33		3	4	2	0	0		
-31	9	5	33	236	26.2222222222	21.0	31.0	33.0	12.0	5	33		1	4	1	3	0		
-32	9	5	33	210	23.3333333333	11.5	29.0	33.0	21.5	5	33		3	3	0	3	0		
-33	9	5	33	183	20.3333333333	8.0	21.0	33.0	25.0	5	33		1	4	2	2	0		
-34	9	5	33	150	16.6666666667	6.0	17.0	25.5	19.5	5	33		3	4	1	1	0		
-35	9	13	33	217	24.1111111111	19.5	24.0	31.0	11.5	13	33		1	4	1	3	0		
-36	9	5	33	195	21.6666666667	11.5	21.0	32.5	21.0	5	33		3	2	1	3	0		
+1	9	23	34	288	32.000000	33.000000	33.000000	33.000000	0.000000	33	33	23,34	3	1	4	1	0		
+2	9	28	33	287	31.888889	30.500000	33.000000	33.000000	2.500000	28	33		3	3	2	1	0		
+3	9	13	34	268	29.777778	27.500000	33.000000	33.500000	6.000000	27	34	13	5	1	0	3	0		
+4	9	17	33	261	29.000000	24.500000	33.000000	33.000000	8.500000	17	33		1	2	3	3	0		
+5	9	22	33	269	29.888889	26.000000	33.000000	33.000000	7.000000	22	33		3	3	3	0	0		
+6	9	22	33	277	30.777778	29.000000	33.000000	33.000000	4.000000	28	33	22	5	3	0	1	0		
+7	9	21	33	258	28.666667	23.000000	33.000000	33.000000	10.000000	21	33		4	1	3	1	0		
+8	9	12	33	263	29.222222	26.500000	33.000000	33.000000	6.500000	21	33	12	2	1	1	5	0		
+9	9	29	33	290	32.222222	31.500000	33.000000	33.000000	1.500000	30	33	29	3	3	2	1	0		
+10	9	23	33	277	30.777778	28.000000	33.000000	33.000000	5.000000	23	33		1	4	2	2	0		
+11	9	12	33	245	27.222222	21.000000	31.000000	33.000000	12.000000	12	33		5	2	1	1	0		
+12	9	13	33	214	23.777778	14.000000	24.000000	33.000000	19.000000	13	33		2	4	2	1	0		
+13	9	5	33	249	27.666667	26.500000	31.000000	33.000000	6.500000	24	33	5	2	1	1	5	0		
+14	9	5	33	233	25.888889	19.500000	33.000000	33.000000	13.500000	5	33		3	3	2	1	0		
+15	9	15	33	251	27.888889	22.500000	33.000000	33.000000	10.500000	15	33		5	1	1	2	0		
+16	9	23	34	269	29.888889	23.500000	33.000000	33.000000	9.500000	23	34		3	1	2	3	0		
+17	9	13	34	266	29.555556	27.000000	33.000000	33.000000	6.000000	21	34	13	2	3	1	3	0		
+18	9	21	34	272	30.222222	26.000000	33.000000	33.000000	7.000000	21	34		0	5	1	3	0		
+19	9	5	34	244	27.111111	24.000000	30.000000	33.000000	9.000000	21	34	5	4	4	1	0	0		
+20	9	11	34	241	26.777778	17.000000	32.000000	33.000000	16.000000	11	34		3	4	2	0	0		
+21	9	13	33	240	26.666667	22.500000	27.000000	33.000000	10.500000	13	33		1	4	0	4	0		
+22	9	5	33	190	21.111111	9.000000	21.000000	33.000000	24.000000	5	33		1	4	0	3	1		
+23	9	5	33	205	22.777778	14.000000	26.000000	33.000000	19.000000	5	33		4	4	1	0	0		
+24	9	5	33	247	27.444444	24.500000	31.000000	33.000000	8.500000	21	33	5	1	5	1	2	0		
+25	9	11	34	241	26.777778	18.500000	33.000000	33.000000	14.500000	11	34		3	4	0	2	0		
+26	9	5	33	212	23.555556	11.500000	31.000000	33.000000	21.500000	5	33		0	6	0	3	0		
+27	9	5	33	227	25.222222	20.000000	26.000000	33.000000	13.000000	5	33		3	4	1	1	0		
+28	9	21	33	255	28.333333	22.500000	31.000000	33.000000	10.500000	21	33		2	4	3	0	0		
+29	9	5	33	228	25.333333	19.500000	30.000000	33.000000	13.500000	5	33		2	4	1	2	0		
+30	9	10	33	213	23.666667	13.500000	28.000000	33.000000	19.500000	10	33		3	4	2	0	0		
+31	9	5	33	236	26.222222	21.000000	31.000000	33.000000	12.000000	5	33		1	4	1	3	0		
+32	9	5	33	210	23.333333	11.500000	29.000000	33.000000	21.500000	5	33		3	3	0	3	0		
+33	9	5	33	183	20.333333	8.000000	21.000000	33.000000	25.000000	5	33		1	4	2	2	0		
+34	9	5	33	150	16.666667	6.000000	17.000000	25.500000	19.500000	5	33		3	4	1	1	0		
+35	9	13	33	217	24.111111	19.500000	24.000000	31.000000	11.500000	13	33		1	4	1	3	0		
+36	9	5	33	195	21.666667	11.500000	21.000000	32.500000	21.000000	5	33		3	2	1	3	0		
--- a/tool_dependencies.xml	Fri Dec 18 19:30:26 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-  <package name="galaxy_sequence_utils" version="1.0.0">
-      <repository changeset_revision="6334612a010e" name="package_galaxy_utils_1_0" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>