diff planemo-template/random_lines_two_pass.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents af383638de66
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/planemo-template/random_lines_two_pass.py	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+#Selects N random lines from a file and outputs to another file, maintaining original line order
+#allows specifying a seed
+#does two passes to determine line offsets/count, and then to output contents
+
+import optparse, random
+
+def get_random_by_subtraction( line_offsets, num_lines ):
+    while len( line_offsets ) > num_lines:
+        del line_offsets[ random.randint( 0, len( line_offsets ) - 1 ) ]
+    return line_offsets
+
+def get_random_by_sample( line_offsets, num_lines ):
+    line_offsets = random.sample( line_offsets, num_lines )
+    line_offsets.sort()
+    return line_offsets
+
+def get_random( line_offsets, num_lines ):
+    if num_lines > ( len( line_offsets ) / 2 ):
+        return get_random_by_subtraction( line_offsets, num_lines )
+    else:
+        return get_random_by_sample( line_offsets, num_lines )
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-s', '--seed', dest='seed', action='store', type="string", default=None, help='Set the random seed.' )
+    (options, args) = parser.parse_args()
+    
+    assert len( args ) == 3, "Invalid command line specified."
+    
+    input = open( args[0], 'rb' )
+    output = open( args[1], 'wb' )
+    num_lines = int( args[2] )
+    assert num_lines > 0, "You must select at least one line."
+    
+    if options.seed is not None:
+        random.seed( options.seed )
+    
+    #get line offsets
+    line_offsets = []
+    teller = input.tell
+    readliner = input.readline
+    appender = line_offsets.append
+    while True:
+        offset = teller()
+        if readliner():
+            appender( offset )
+        else:
+            break
+    
+    total_lines = len( line_offsets )
+    assert num_lines <= total_lines, "Error: asked to select more lines (%i) than there were in the file (%i)." % ( num_lines, total_lines )
+    
+    #get random line offsets
+    line_offsets = get_random( line_offsets, num_lines )
+    
+    #write out random lines
+    seeker = input.seek
+    writer = output.write
+    for line_offset in line_offsets:
+        seeker( line_offset )
+        writer( readliner() )
+    input.close()
+    output.close()
+    print "Kept %i of %i total lines." % ( num_lines, total_lines )
+    if options.seed is not None:
+        print 'Used random seed of "%s".' % options.seed
+    
+if __name__=="__main__": __main__()