Mercurial > repos > bgruening > text_processing

--- a/replace_text_in_column.xml	Tue Feb 27 17:10:53 2018 -0500
+++ b/replace_text_in_column.xml	Tue Jan 01 06:01:13 2019 -0500
@@ -1,4 +1,4 @@
-<tool id="tp_replace_in_column" name="Replace Text" version="@BASE_VERSION@.2">
+<tool id="tp_replace_in_column" name="Replace Text" version="@BASE_VERSION@.3">
     <description>in a specific column</description>
     <macros>
         <import>macros.xml</import>
@@ -13,41 +13,62 @@
             -v OFS="\t"
             -v FS="\t"
             --re-interval
-            --sandbox '{ \$$column = gensub( /$find_pattern/, "$replace_pattern", "g", \$$column ) ; print \$0 ; }'
+            --sandbox
+            '{
+            #for $replacement in $replacements:
+                \$$replacement.column = gensub( /$replacement.find_pattern/, "$replacement.replace_pattern", "g", \$$replacement.column ) ;
+            #end for
+            print \$0 ; }'
             "$infile"
         > "$outfile"
 ]]>
     </command>
     <inputs>
         <param format="tabular" name="infile" type="data" label="File to process" />
-        <param name="column" label="in column" type="data_column" data_ref="infile" accept_default="true" />
+        <repeat name="replacements" title="Replacement" min="1">
+            <param name="column" label="in column" type="data_column" data_ref="infile" accept_default="true" />

-        <param name="find_pattern" type="text" label="Find pattern" help="Use simple text, or a valid regular expression (without backslashes // ) " >
-            <sanitizer>
-                <valid initial="string.printable">
-                    <remove value="&apos;"/>
-                </valid>
-            </sanitizer>
-        </param>
-        <param name="replace_pattern" type="text" label="Replace with" help="Use simple text, or &amp; (ampersand) and \\1 \\2 \\3 to refer to matched text. See examples below." >
-            <sanitizer>
-                <valid initial="string.printable">
-                    <remove value="&apos;"/>
-                </valid>
-            </sanitizer>
-        </param>
+            <param name="find_pattern" type="text" label="Find pattern" help="Use simple text, or a valid regular expression (without backslashes // ) " >
+                <sanitizer>
+                    <valid initial="string.printable">
+                        <remove value="&apos;"/>
+                    </valid>
+                </sanitizer>
+            </param>
+            <param name="replace_pattern" type="text" label="Replace with" help="Use simple text, or &amp; (ampersand) and \\1 \\2 \\3 to refer to matched text. See examples below." >
+                <sanitizer>
+                    <valid initial="string.printable">
+                        <remove value="&apos;"/>
+                    </valid>
+                </sanitizer>
+            </param>
+        </repeat>
     </inputs>
     <outputs>
         <data name="outfile" format_source="infile" metadata_source="infile" />
     </outputs>
     <tests>
-          <test>
+        <test>
               <param name="infile" value="replace_text_in_column1.txt" ftype="tabular" />
               <param name="column" value="4" />
               <param name="find_pattern" value=".+_(R.)" />
               <param name="replace_pattern" value="\\1" />
               <output name="outfile" file="replace_text_in_column_results1.txt" />
         </test>
+        <test>
+              <param name="infile" value="replace_text_in_column1.txt" ftype="tabular" />
+              <repeat name="replacements">
+                <param name="column" value="1" />
+                <param name="find_pattern" value="[a-z]{2}([a-z])" />
+                <param name="replace_pattern" value="\\1" />
+              </repeat>
+              <repeat name="replacements">
+                <param name="column" value="4" />
+                <param name="find_pattern" value=".+_(R.)" />
+                <param name="replace_pattern" value="\\1" />
+              </repeat>
+              <output name="outfile" file="replace_text_in_column_results2.txt" />
+        </test>
     </tests>
     <help>
 <![CDATA[
--- a/replace_text_in_line.xml	Tue Feb 27 17:10:53 2018 -0500
+++ b/replace_text_in_line.xml	Tue Jan 01 06:01:13 2019 -0500
@@ -1,7 +1,7 @@
-<tool id="tp_replace_in_line" name="Replace Text" version="@BASE_VERSION@.1">
+<tool id="tp_replace_in_line" name="Replace Text" version="@BASE_VERSION@.2">
     <description>in entire line</description>
     <macros>
-	<import>macros.xml</import>
+       <import>macros.xml</import>
     </macros>
     <requirements>
         <requirement type="package" version="4.4">sed</requirement>
@@ -9,17 +9,21 @@
     <version_command>sed --version | head -n 1</version_command>
     <command>
 <![CDATA[
-	 sed
+   sed
       -r
       --sandbox
-      's/$find_pattern/$replace_pattern/g'
+      #for $replacement in $replacements:
+        -e
+        's/$replacement.find_pattern/$replacement.replace_pattern/g'
+      #end for
       '$infile'
       > '$outfile'
 ]]>

     </command>
     <inputs>
-	<param format="txt" name="infile" type="data" label="File to process" />
+      <param format="txt" name="infile" type="data" label="File to process" />
+      <repeat name="replacements" title="Replacement" min="1">
          <param name="find_pattern" type="text" size="20" label="Find pattern" help="Use simple text, or a valid regular expression (without backslashes // ) " >
             <sanitizer>
                 <valid initial="string.printable">
@@ -31,7 +35,7 @@
                     <add source="/" target="\/"/>
                 </mapping>
             </sanitizer>
-        </param>
+         </param>
          <param name="replace_pattern" type="text" size="20" label="Replace with:" help="Use simple text, or &amp; (ampersand) and \\1 \\2 \\3 to refer to matched text. See examples below." >
             <sanitizer>
                 <valid initial="string.printable">
@@ -46,17 +50,30 @@
             </sanitizer>

         </param>
+      </repeat>
     </inputs>
     <outputs>
-	<data name="outfile" format_source="infile" metadata_source="infile"/>
+  <data name="outfile" format_source="infile" metadata_source="infile"/>
     </outputs>
     <tests>
-	<test>
+         <test>
             <param name="infile" value="replace_text_in_line1.txt" />
             <param name="find_pattern" value="CTC." />
             <param name="replace_pattern" value="FOOBAR" />
             <output name="outfile" file="replace_text_in_line_results1.txt" />
         </test>
+        <test>
+            <param name="infile" value="replace_text_in_line1.txt" />
+            <repeat name="replacements">
+              <param name="find_pattern" value="CTC." />
+              <param name="replace_pattern" value="FOOBAR" />
+            </repeat>
+            <repeat name="replacements">
+              <param name="find_pattern" value="chr" />
+              <param name="replace_pattern" value="domain" />
+            </repeat>
+            <output name="outfile" file="replace_text_in_line_results2.txt" />
+        </test>
     </tests>
     <help>
 <![CDATA[
@@ -77,9 +94,9 @@
 **Examples of Find Patterns**

 - **HELLO**     The word 'HELLO' (case sensitive).
-- **AG.T**	The letters A,G followed by any single character, followed by the letter T.
+- **AG.T**  The letters A,G followed by any single character, followed by the letter T.
 - **A{4,}**     Four or more consecutive A's.
-- **chr2[012]\\t**	 The words 'chr20' or 'chr21' or 'chr22' followed by a tab character.
+- **chr2[012]\\t**   The words 'chr20' or 'chr21' or 'chr22' followed by a tab character.
 - **hsa-mir-([^ ]+)**        The text 'hsa-mir-' followed by one-or-more non-space characters. When using parentheses, the matched content of the parentheses can be accessed with **\1** in the **replace** pattern.


@@ -112,7 +129,7 @@

 -----

-**Extened Regular Expression Syntax**
+**Extended Regular Expression Syntax**

 The select tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern describing a certain amount of text.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/replace_text_in_column_results2.txt	Tue Jan 01 06:01:13 2019 -0500
@@ -0,0 +1,3 @@
+r7	56632	56652	R6	310	+
+r7	56736	56756	R7	354	+
+r7	56761	56781	R4	220	+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/replace_text_in_line_results2.txt	Tue Jan 01 06:01:13 2019 -0500
@@ -0,0 +1,3 @@
+domain7	56632	56652	D17003_FOOBAR_R6	310	+
+domain7	56736	56756	D17003_FOOBAR_R7	354	+
+domain7	56761	56781	D17003_FOOBAR_R4	220	+