changeset 0:6201f462adb7 draft

planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit 561cde8c8bd4a6164b1bef19ecff9809ac3340e0-dirty
author public-health-bioinformatics
date Wed, 09 Jan 2019 15:03:03 -0500
parents
children ba7cee75eb68
files change_fasta_deflines.py change_fasta_deflines.xml test-data/csv_rename_file.csv test-data/fasta_2_rename.fasta test-data/tab_delim_rename_file.txt
diffstat 5 files changed, 141 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_fasta_deflines.py	Wed Jan 09 15:03:03 2019 -0500
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+import sys, argparse
+'''Accepts either csv (default) or tab-delimited files with old/new sequence names, creating a dictionary of 
+respective key:value pairs. Parses an input fasta file for 'old' names, replacing them with 'new' names, writing
+renamed sequences to a fasta file. NOTE: use of tab-delim text file for renaming requires '-t' on cmd line.'''
+#USAGE EXAMPLE 1: python change_fasta_deflines.py csv_rename_file.csv fasta_2_rename.fasta renamedSequences.fasta
+#USAGE EXAMPLE 2: python change_fasta_deflines.py tab_delim_rename_file.txt -t fasta_2_rename.fasta renamedSequences.fasta
+
+'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory,Sept 2017'''
+
+#parse command line arguments; dest names are unchanged, so the script still reads args.inFileHandle, args.inFileHandle2, args.outFileHandle
+parser = argparse.ArgumentParser(description="Rename fasta definition lines using a file of old/new name pairs")
+parser.add_argument("-t", "--tab_delim", help="name fasta definition lines from tab-delim file", action="store_true")
+parser.add_argument("inFileHandle", help="csv (or, with -t, tab-delimited) file: current names in column 1, desired names in column 2")
+parser.add_argument("inFileHandle2", help="fasta file containing sequences requiring name replacement")
+parser.add_argument("outFileHandle", help="output fasta filename (over-written if it already exists)")
+args = parser.parse_args()
+
+#build a lookup of old_name:new_name pairs from the naming file
+name_map = {} #old_name:new_name pairs (named to avoid shadowing builtin 'dict')
+splitter = ',' #default char to split lines at
+#determine if input naming file is csv (default) or tab delim text
+if args.tab_delim:
+    splitter = '\t' #change splitter to tab if cmd line args contain '-t'
+
+#create dictionary using key/value pairs from file of old/new names
+with open(args.inFileHandle,'r') as inputFile:
+    for line in inputFile:
+        #remove trailing whitespace; skip blank lines (e.g. at end of file)
+        line2 = line.rstrip()
+        if not line2:
+            continue
+        splitLine = line2.split(splitter)
+        if len(splitLine) < 2:
+            #exit with a clear message instead of an IndexError traceback
+            sys.exit("Naming file line has no delimiter: " + line2)
+        #column 1 holds the current name, column 2 the replacement name
+        old_name = splitLine[0]
+        new_name = splitLine[1]
+        name_map[old_name] = new_name
+
+#parse fasta deflines for 'old' names and, if found, replace with new names;
+#the output file is over-written if it exists and closed by the 'with' block
+with open(args.inFileHandle2,'r') as inputFile2, open(args.outFileHandle,'w') as outfile:
+    for line in inputFile2:
+        #only a line STARTING with '>' is a defline; a '>' elsewhere in a
+        #line must not turn a sequence line into a defline
+        if line.startswith(">"):
+            #remove trailing whitespace & the leading '>'
+            originalDefline = line.rstrip()[1:]
+            #'in' works on python 2 and 3 (dict.has_key was removed in python 3)
+            if originalDefline in name_map:
+                outfile.write(">" + name_map[originalDefline] + "\n")
+            else:
+                #report the unmatched name and keep the original defline; one
+                #print argument gives the same output on python 2 and 3
+                print("Defline not in dictionary: " + originalDefline)
+                outfile.write(">" + originalDefline + "\n")
+        else:
+            #in lines without a leading ">", write out sequence as it was
+            #(trailing whitespace removed, newline restored)
+            outfile.write(line.rstrip() + "\n")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/change_fasta_deflines.xml	Wed Jan 09 15:03:03 2019 -0500
@@ -0,0 +1,39 @@
+<tool id="change_fasta_deflines" name="Change Fasta Deflines" version="0.0.1">
+  <requirements>
+    <requirement type="package" version="1.7.0">biopython</requirement> <!-- NOTE(review): change_fasta_deflines.py imports only sys and argparse; confirm biopython is needed -->
+  </requirements>
+  <command detect_errors="exit_code"><![CDATA[
+    python '$__tool_directory__/change_fasta_deflines.py'
+    '$key_value_pairs'
+    '$input_fasta'
+    '$output_file'
+    #if $tab_delim
+      -t
+    #end if
+  ]]></command>
+  <inputs>
+    <param name="input_fasta" format="fasta" type="data" label="Fasta file containing sequences to rename" />
+    <param name="tab_delim" type="boolean" label="Names file is tab-delimited." checked="false" />
+    <param name="key_value_pairs" format="csv,tabular,txt" type="data" label="Names file (current name in column 1, new name in column 2)" />
+  </inputs>
+  <outputs>
+    <data name="output_file" format="fasta"/>
+  </outputs>
+  <tests> <!-- NOTE(review): test-data/output.fasta is not among the files listed in this changeset; confirm it exists -->
+    <test>
+      <param name="input_fasta" value="fasta_2_rename.fasta" />
+      <param name="key_value_pairs" value="csv_rename_file.csv" />
+      <output name="output_file" value="output.fasta" />
+    </test>
+    <test>
+      <param name="input_fasta" value="fasta_2_rename.fasta" />
+      <param name="key_value_pairs" value="tab_delim_rename_file.txt" />
+      <param name="tab_delim" value="true" />
+      <output name="output_file" value="output.fasta" />
+    </test>
+  </tests>
+  <help><![CDATA[Replaces fasta definition lines with new names. The names file lists the current name in column 1 and the new name in column 2, comma-separated by default or tab-separated when the tab-delimited option is checked. Deflines with no match in the names file are written unchanged.
+  ]]></help>
+  <citations>
+  </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/csv_rename_file.csv	Wed Jan 09 15:03:03 2019 -0500
@@ -0,0 +1,10 @@
+s1,sample_1
+s2,sample_2
+s3,sample_3
+s4,sample_4
+s5,sample_5
+s6,sample_6
+s7,sample_7
+s8,sample_8
+s9,sample_9
+s10,sample_10
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_2_rename.fasta	Wed Jan 09 15:03:03 2019 -0500
@@ -0,0 +1,20 @@
+>s1
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI
+>s2
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI
+>s3
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI
+>s4
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGGNCTLIDALLGDPQCDGFQNKKWDLFVERSRAYSNCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVTQNGTSSACIRGSSSSFFSRLNWLTHLNYTYPALNVTMPNKEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCIALLGFIMWACQKGNIRCNICI
+>s5
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI
+>s6
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI
+>s7
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI
+>s8
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI
+>s9
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI
+>s10
+QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSRAYSNCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVTQNGNSSACIRRSSSSFFSRLNWLTHLNYTYPALNVTMPNKEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYIEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tab_delim_rename_file.txt	Wed Jan 09 15:03:03 2019 -0500
@@ -0,0 +1,10 @@
+s1	sample_1
+s2	sample_2
+s3	sample_3
+s4	sample_4
+s5	sample_5
+s6	sample_6
+s7	sample_7
+s8	sample_8
+s9	sample_9
+s10	sample_10