diff fastq_combiner.py @ 0:82f441097692 draft

Imported from capsule None
author devteam
date Thu, 23 Jan 2014 12:31:59 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_combiner.py	Thu Jan 23 12:31:59 2014 -0500
@@ -0,0 +1,49 @@
+#Dan Blankenberg
+import sys, os, shutil
+from galaxy_utils.sequence.fastq import fastqWriter, fastqSequencingRead, fastqCombiner, fastqFakeFastaScoreReader
+from galaxy_utils.sequence.fasta import fastaReader, fastaNamedReader
+
+def main():
+    #Read command line arguments
+    fasta_filename = sys.argv[1]
+    fasta_type = sys.argv[2] or 'fasta' #should always be fasta or csfasta? what if txt?
+    qual_filename = sys.argv[3]
+    qual_type = sys.argv[4] or 'qualsanger' #qual454 qualsolid
+    output_filename = sys.argv[5]
+    force_quality_encoding = sys.argv[6]
+    if force_quality_encoding == 'None':
+        force_quality_encoding = None
+    
+    format = 'sanger'
+    if fasta_type == 'csfasta' or qual_type == 'qualsolid':
+        format = 'cssanger'
+    elif qual_type == 'qualsolexa':
+        format = 'solexa'
+    elif qual_type == 'qualillumina':
+        format = 'illumina'
+    
+    out = fastqWriter( open( output_filename, 'wb' ), format = format, force_quality_encoding = force_quality_encoding )
+    if qual_filename == 'None':
+        qual_input = fastqFakeFastaScoreReader( format, quality_encoding = force_quality_encoding )
+    else:
+        qual_input = fastaNamedReader( open( qual_filename, 'rb' )  )
+    
+    fastq_combiner = fastqCombiner( format )
+    i = None
+    skip_count = 0
+    for i, sequence in enumerate( fastaReader( open( fasta_filename, 'rb' ) ) ):
+        quality = qual_input.get( sequence )
+        if quality:
+            fastq_read = fastq_combiner.combine( sequence, quality )
+            out.write( fastq_read )
+        else:
+            skip_count += 1
+    out.close()
+    if i is None:
+        print "Your file contains no valid FASTA sequences."
+    else:
+        print qual_input.has_data()
+        print 'Combined %s of %s sequences with quality scores (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 )
+
+if __name__ == "__main__":
+    main()