Mercurial > repos > mingchen0919 > rmarkdown_bdss_client

--- a/bdss_client_sra.Rmd	Sat Oct 14 19:54:57 2017 -0400
+++ b/bdss_client_sra.Rmd	Sat Oct 14 22:59:06 2017 -0400
@@ -38,9 +38,9 @@
 dir.create('pe_read_files_directory')
 # download and extract reads (single end)
 sra_ids_se = strsplit(gsub(',', ' ', 'SRA_IDS_SE'), ' ')[[1]]
-sra_ids_se = sra_ids[sra_ids != '']
+sra_ids_se = sra_ids_se[sra_ids_se != '']
 # loop through SRA accessions to download and extract reads.
-for(id in sra_ids) {
+for(id in sra_ids_se) {
     # build URL from SRA id
     url = paste0('ftp://ftp.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/',
                  substr(id, 1, 3), '/',
@@ -50,17 +50,19 @@
     system(bdss_command, intern = TRUE)
     # convert .sra to .fastq/.fasta
     if('FORMAT' == 'fasta') {
-      command = paste0('fastq-dump --fasta -O read_files_directory ', id)
+      command = paste0('fastq-dump --fasta -O se_read_files_directory ', id, '.sra')
     } else {
-      command = paste0('fastq-dump -O read_files_directory ', id)
+      command = paste0('fastq-dump -O se_read_files_directory ', id, '.sra')
     }
+    cat('----convert SRA to fastq/fasta------\n')
+    print(system(command, intern = TRUE))
 }

 # download and extract reads (paired end)
 sra_ids_pe = strsplit(gsub(',', ' ', 'SRA_IDS_PE'), ' ')[[1]]
-sra_ids_pe = sra_ids[sra_ids != '']
+sra_ids_pe = sra_ids_pe[sra_ids_pe != '']
 # loop through SRA accessions to download and extract reads.
-for(id in sra_ids) {
+for(id in sra_ids_pe) {
     # build URL from SRA id
     url = paste0('ftp://ftp.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/',
                  substr(id, 1, 3), '/',
@@ -70,14 +72,29 @@
     system(bdss_command, intern = TRUE)
     # convert .sra to .fastq/.fasta
     if('FORMAT' == 'fasta') {
-      command = paste0('fastq-dump --fasta --split-files -O pe_read_files_directory ', id)
+      command = paste0('fastq-dump --fasta --split-files -O pe_read_files_directory ', id, '.sra')
     } else {
-      command = paste0('fastq-dump --split-files -O pe_read_files_directory ', id)
+      command = paste0('fastq-dump --split-files -O pe_read_files_directory ', id, '.sra')
     }
+    cat('----convert SRA to fastq/fasta------\n')
+    command_stdout = system(command, intern = TRUE)
+    print(command_stdout)
+    if(length(command_stdout) < 3) {
+      # fewer than 3 lines of fastq-dump output is treated as "not paired end"; remove any <id>_1.* file written.
+      cat(paste0(id, ' is not paired end SRA, the corresponding fastq/fasta file will be deleted.\n'))
+      unlink(Sys.glob(paste0('pe_read_files_directory/', id, '_1.*')))
+    }
+
 }

+cat('-----single end files----\n')
+list.files('./se_read_files_directory')
+cat('-----paired end files----\n')
+list.files('./pe_read_files_directory')
+
+cat('-----Renaming files------\n')
 # rename files for paired end reads
-old_files = paste0('./read_files_directory/', list.files('./read_files_directory'))
+old_files = list.files('./pe_read_files_directory', full.names = TRUE)  # character(0) when empty; paste0() would yield the bare directory path
 new_files = gsub('_1', '_forward', old_files)
 new_files = gsub('_2', '_reverse', new_files)
 file.rename(old_files, new_files)
--- a/bdss_client_sra.xml	Sat Oct 14 19:54:57 2017 -0400
+++ b/bdss_client_sra.xml	Sat Oct 14 22:59:06 2017 -0400
@@ -24,6 +24,7 @@
             Rscript '${__tool_directory__}/bdss_client_sra_render.R'
                 -i '$sra_ids_se'
                 -p '$sra_ids_pe'
+                -f $format
                 -e $echo

                 -r $report
@@ -46,12 +47,12 @@
     <outputs>
         <data format="html" name="report" label="BDSS client report"/>
         <!--list dataset collection for single end SRA data-->
-        <collection type="list" name="list_collection" label="Fastq-dump (single end reads)">
-            <discover_datasets pattern="__name_and_ext__" directory="read_files_directory" />
+        <collection type="list" name="list_collection" label="BDSS download data (single end reads)">
+            <discover_datasets pattern="__name_and_ext__" directory="se_read_files_directory" />
         </collection>
         <!--list:paired dataset collection for paired end SRA data-->
-        <collection type="list:paired" name="list_collection" label="Fastq-dump (paired end reads)">
-            <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_(?P&lt;identifier_1&gt;[^_]+)\.(?P&lt;ext&gt;[^\._]+)?" directory="read_files_directory"/>
+        <collection type="list:paired" name="list:paired_collection" label="BDSS download data (paired end reads)">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_(?P&lt;identifier_1&gt;[^_]+)\.(?P&lt;ext&gt;[^\._]+)?" directory="pe_read_files_directory" />
         </collection>
         <data format="txt" name="sink_message" label="Warnings and Errors" from_work_dir="warnings_and_errors.txt"/>
     </outputs>
--- a/bdss_client_sra_render.R	Sat Oct 14 19:54:57 2017 -0400
+++ b/bdss_client_sra_render.R	Sat Oct 14 22:59:06 2017 -0400
@@ -34,7 +34,8 @@
 ##------- 1. input data ---------------------
 args_list=list()
 args_list$SRA_IDS_SE = c('sra_ids_se', 'i', '1', 'character')
-args_list$SRA_IDS_PE = c('sra_ids_pe', 'i', '1', 'character')
+args_list$SRA_IDS_PE = c('sra_ids_pe', 'p', '1', 'character')
+args_list$FORMAT = c('format', 'f', '1', 'character')
 args_list$ECHO = c('echo', 'e', '1', 'character')
 ##--------2. output report and outputs --------------
 args_list$REPORT_HTML = c('report_html', 'r', '1', 'character')