").append(x.parseHTML(e)).find(i):e)}).complete(r&&function(e,t){s.each(r,o||[e.responseText,t,e])}),this},x.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){x.fn[t]=function(e){return this.on(t,e)}}),x.extend({active:0,lastModified:{},etag:{},ajaxSettings:{url:yn,type:"GET",isLocal:Cn.test(mn[1]),global:!0,processData:!0,async:!0,contentType:"application/x-www-form-urlencoded; charset=UTF-8",accepts:{"*":Dn,text:"text/plain",html:"text/html",xml:"application/xml, text/xml",json:"application/json, text/javascript"},contents:{xml:/xml/,html:/html/,json:/json/},responseFields:{xml:"responseXML",text:"responseText",json:"responseJSON"},converters:{"* text":String,"text html":!0,"text json":x.parseJSON,"text xml":x.parseXML},flatOptions:{url:!0,context:!0}},ajaxSetup:function(e,t){return t?_n(_n(e,x.ajaxSettings),t):_n(x.ajaxSettings,e)},ajaxPrefilter:Hn(An),ajaxTransport:Hn(jn),ajax:function(e,n){"object"==typeof e&&(n=e,e=t),n=n||{};var r,i,o,a,s,l,u,c,p=x.ajaxSetup({},n),f=p.context||p,d=p.context&&(f.nodeType||f.jquery)?x(f):x.event,h=x.Deferred(),g=x.Callbacks("once memory"),m=p.statusCode||{},y={},v={},b=0,w="canceled",C={readyState:0,getResponseHeader:function(e){var t;if(2===b){if(!c){c={};while(t=Tn.exec(a))c[t[1].toLowerCase()]=t[2]}t=c[e.toLowerCase()]}return null==t?null:t},getAllResponseHeaders:function(){return 2===b?a:null},setRequestHeader:function(e,t){var n=e.toLowerCase();return b||(e=v[n]=v[n]||e,y[e]=t),this},overrideMimeType:function(e){return b||(p.mimeType=e),this},statusCode:function(e){var t;if(e)if(2>b)for(t in e)m[t]=[m[t],e[t]];else C.always(e[C.status]);return this},abort:function(e){var t=e||w;return u&&u.abort(t),k(0,t),this}};if(h.promise(C).complete=g.add,C.success=C.done,C.error=C.fail,p.url=((e||p.url||yn)+"").replace(xn,"").replace(kn,mn[1]+"//"),p.type=n.method||n.type||p.method||p.type,p.dataTypes=x.trim(p.dataType||"*").toLowerCase().match(T)||[""],null==p.crossDomain&&(r=En.exec(p.url.toLowerCase()),p.crossDomain=!(!r||r[1]===mn[1]&&r[2]===mn[2]&&(r[3]||("http:"===r[1]?"80":"443"))===(mn[3]||("http:"===mn[1]?"80":"443")))),p.data&&p.processData&&"string"!=typeof p.data&&(p.data=x.param(p.data,p.traditional)),qn(An,p,n,C),2===b)return C;l=p.global,l&&0===x.active++&&x.event.trigger("ajaxStart"),p.type=p.type.toUpperCase(),p.hasContent=!Nn.test(p.type),o=p.url,p.hasContent||(p.data&&(o=p.url+=(bn.test(o)?"&":"?")+p.data,delete p.data),p.cache===!1&&(p.url=wn.test(o)?o.replace(wn,"$1_="+vn++):o+(bn.test(o)?"&":"?")+"_="+vn++)),p.ifModified&&(x.lastModified[o]&&C.setRequestHeader("If-Modified-Since",x.lastModified[o]),x.etag[o]&&C.setRequestHeader("If-None-Match",x.etag[o])),(p.data&&p.hasContent&&p.contentType!==!1||n.contentType)&&C.setRequestHeader("Content-Type",p.contentType),C.setRequestHeader("Accept",p.dataTypes[0]&&p.accepts[p.dataTypes[0]]?p.accepts[p.dataTypes[0]]+("*"!==p.dataTypes[0]?", "+Dn+"; q=0.01":""):p.accepts["*"]);for(i in p.headers)C.setRequestHeader(i,p.headers[i]);if(p.beforeSend&&(p.beforeSend.call(f,C,p)===!1||2===b))return C.abort();w="abort";for(i in{success:1,error:1,complete:1})C[i](p[i]);if(u=qn(jn,p,n,C)){C.readyState=1,l&&d.trigger("ajaxSend",[C,p]),p.async&&p.timeout>0&&(s=setTimeout(function(){C.abort("timeout")},p.timeout));try{b=1,u.send(y,k)}catch(N){if(!(2>b))throw N;k(-1,N)}}else k(-1,"No Transport");function k(e,n,r,i){var c,y,v,w,T,N=n;2!==b&&(b=2,s&&clearTimeout(s),u=t,a=i||"",C.readyState=e>0?4:0,c=e>=200&&300>e||304===e,r&&(w=Mn(p,C,r)),w=On(p,w,C,c),c?(p.ifModified&&(T=C.getResponseHeader("Last-Modified"),T&&(x.lastModified[o]=T),T=C.getResponseHeader("etag"),T&&(x.etag[o]=T)),204===e||"HEAD"===p.type?N="nocontent":304===e?N="notmodified":(N=w.state,y=w.data,v=w.error,c=!v)):(v=N,(e||!N)&&(N="error",0>e&&(e=0))),C.status=e,C.statusText=(n||N)+"",c?h.resolveWith(f,[y,N,C]):h.rejectWith(f,[C,N,v]),C.statusCode(m),m=t,l&&d.trigger(c?"ajaxSuccess":"ajaxError",[C,p,c?y:v]),g.fireWith(f,[C,N]),l&&(d.trigger("ajaxComplete",[C,p]),--x.active||x.event.trigger("ajaxStop")))}return C},getJSON:function(e,t,n){return x.get(e,t,n,"json")},getScript:function(e,n){return x.get(e,t,n,"script")}}),x.each(["get","post"],function(e,n){x[n]=function(e,r,i,o){return x.isFunction(r)&&(o=o||i,i=r,r=t),x.ajax({url:e,type:n,dataType:o,data:r,success:i})}});function Mn(e,n,r){var i,o,a,s,l=e.contents,u=e.dataTypes;while("*"===u[0])u.shift(),o===t&&(o=e.mimeType||n.getResponseHeader("Content-Type"));if(o)for(s in l)if(l[s]&&l[s].test(o)){u.unshift(s);break}if(u[0]in r)a=u[0];else{for(s in r){if(!u[0]||e.converters[s+" "+u[0]]){a=s;break}i||(i=s)}a=a||i}return a?(a!==u[0]&&u.unshift(a),r[a]):t}function On(e,t,n,r){var i,o,a,s,l,u={},c=e.dataTypes.slice();if(c[1])for(a in e.converters)u[a.toLowerCase()]=e.converters[a];o=c.shift();while(o)if(e.responseFields[o]&&(n[e.responseFields[o]]=t),!l&&r&&e.dataFilter&&(t=e.dataFilter(t,e.dataType)),l=o,o=c.shift())if("*"===o)o=l;else if("*"!==l&&l!==o){if(a=u[l+" "+o]||u["* "+o],!a)for(i in u)if(s=i.split(" "),s[1]===o&&(a=u[l+" "+s[0]]||u["* "+s[0]])){a===!0?a=u[i]:u[i]!==!0&&(o=s[0],c.unshift(s[1]));break}if(a!==!0)if(a&&e["throws"])t=a(t);else try{t=a(t)}catch(p){return{state:"parsererror",error:a?p:"No conversion from "+l+" to "+o}}}return{state:"success",data:t}}x.ajaxSetup({accepts:{script:"text/javascript, application/javascript, application/ecmascript, application/x-ecmascript"},contents:{script:/(?:java|ecma)script/},converters:{"text script":function(e){return x.globalEval(e),e}}}),x.ajaxPrefilter("script",function(e){e.cache===t&&(e.cache=!1),e.crossDomain&&(e.type="GET",e.global=!1)}),x.ajaxTransport("script",function(e){if(e.crossDomain){var n,r=a.head||x("head")[0]||a.documentElement;return{send:function(t,i){n=a.createElement("script"),n.async=!0,e.scriptCharset&&(n.charset=e.scriptCharset),n.src=e.url,n.onload=n.onreadystatechange=function(e,t){(t||!n.readyState||/loaded|complete/.test(n.readyState))&&(n.onload=n.onreadystatechange=null,n.parentNode&&n.parentNode.removeChild(n),n=null,t||i(200,"success"))},r.insertBefore(n,r.firstChild)},abort:function(){n&&n.onload(t,!0)}}}});var Fn=[],Bn=/(=)\?(?=&|$)|\?\?/;x.ajaxSetup({jsonp:"callback",jsonpCallback:function(){var e=Fn.pop()||x.expando+"_"+vn++;return this[e]=!0,e}}),x.ajaxPrefilter("json jsonp",function(n,r,i){var o,a,s,l=n.jsonp!==!1&&(Bn.test(n.url)?"url":"string"==typeof n.data&&!(n.contentType||"").indexOf("application/x-www-form-urlencoded")&&Bn.test(n.data)&&"data");return l||"jsonp"===n.dataTypes[0]?(o=n.jsonpCallback=x.isFunction(n.jsonpCallback)?n.jsonpCallback():n.jsonpCallback,l?n[l]=n[l].replace(Bn,"$1"+o):n.jsonp!==!1&&(n.url+=(bn.test(n.url)?"&":"?")+n.jsonp+"="+o),n.converters["script json"]=function(){return s||x.error(o+" was not called"),s[0]},n.dataTypes[0]="json",a=e[o],e[o]=function(){s=arguments},i.always(function(){e[o]=a,n[o]&&(n.jsonpCallback=r.jsonpCallback,Fn.push(o)),s&&x.isFunction(a)&&a(s[0]),s=a=t}),"script"):t});var Pn,Rn,Wn=0,$n=e.ActiveXObject&&function(){var e;for(e in Pn)Pn[e](t,!0)};function In(){try{return new e.XMLHttpRequest}catch(t){}}function zn(){try{return new e.ActiveXObject("Microsoft.XMLHTTP")}catch(t){}}x.ajaxSettings.xhr=e.ActiveXObject?function(){return!this.isLocal&&In()||zn()}:In,Rn=x.ajaxSettings.xhr(),x.support.cors=!!Rn&&"withCredentials"in Rn,Rn=x.support.ajax=!!Rn,Rn&&x.ajaxTransport(function(n){if(!n.crossDomain||x.support.cors){var r;return{send:function(i,o){var a,s,l=n.xhr();if(n.username?l.open(n.type,n.url,n.async,n.username,n.password):l.open(n.type,n.url,n.async),n.xhrFields)for(s in n.xhrFields)l[s]=n.xhrFields[s];n.mimeType&&l.overrideMimeType&&l.overrideMimeType(n.mimeType),n.crossDomain||i["X-Requested-With"]||(i["X-Requested-With"]="XMLHttpRequest");try{for(s in i)l.setRequestHeader(s,i[s])}catch(u){}l.send(n.hasContent&&n.data||null),r=function(e,i){var s,u,c,p;try{if(r&&(i||4===l.readyState))if(r=t,a&&(l.onreadystatechange=x.noop,$n&&delete Pn[a]),i)4!==l.readyState&&l.abort();else{p={},s=l.status,u=l.getAllResponseHeaders(),"string"==typeof l.responseText&&(p.text=l.responseText);try{c=l.statusText}catch(f){c=""}s||!n.isLocal||n.crossDomain?1223===s&&(s=204):s=p.text?200:404}}catch(d){i||o(-1,d)}p&&o(s,c,p,u)},n.async?4===l.readyState?setTimeout(r):(a=++Wn,$n&&(Pn||(Pn={},x(e).unload($n)),Pn[a]=r),l.onreadystatechange=r):r()},abort:function(){r&&r(t,!0)}}}});var Xn,Un,Vn=/^(?:toggle|show|hide)$/,Yn=RegExp("^(?:([+-])=|)("+w+")([a-z%]*)$","i"),Jn=/queueHooks$/,Gn=[nr],Qn={"*":[function(e,t){var n=this.createTween(e,t),r=n.cur(),i=Yn.exec(t),o=i&&i[3]||(x.cssNumber[e]?"":"px"),a=(x.cssNumber[e]||"px"!==o&&+r)&&Yn.exec(x.css(n.elem,e)),s=1,l=20;if(a&&a[3]!==o){o=o||a[3],i=i||[],a=+r||1;do s=s||".5",a/=s,x.style(n.elem,e,a+o);while(s!==(s=n.cur()/r)&&1!==s&&--l)}return i&&(a=n.start=+a||+r||0,n.unit=o,n.end=i[1]?a+(i[1]+1)*i[2]:+i[2]),n}]};function Kn(){return setTimeout(function(){Xn=t}),Xn=x.now()}function Zn(e,t,n){var r,i=(Qn[t]||[]).concat(Qn["*"]),o=0,a=i.length;for(;a>o;o++)if(r=i[o].call(n,t,e))return r}function er(e,t,n){var r,i,o=0,a=Gn.length,s=x.Deferred().always(function(){delete l.elem}),l=function(){if(i)return!1;var t=Xn||Kn(),n=Math.max(0,u.startTime+u.duration-t),r=n/u.duration||0,o=1-r,a=0,l=u.tweens.length;for(;l>a;a++)u.tweens[a].run(o);return s.notifyWith(e,[u,o,n]),1>o&&l?n:(s.resolveWith(e,[u]),!1)},u=s.promise({elem:e,props:x.extend({},t),opts:x.extend(!0,{specialEasing:{}},n),originalProperties:t,originalOptions:n,startTime:Xn||Kn(),duration:n.duration,tweens:[],createTween:function(t,n){var r=x.Tween(e,u.opts,t,n,u.opts.specialEasing[t]||u.opts.easing);return u.tweens.push(r),r},stop:function(t){var n=0,r=t?u.tweens.length:0;if(i)return this;for(i=!0;r>n;n++)u.tweens[n].run(1);return t?s.resolveWith(e,[u,t]):s.rejectWith(e,[u,t]),this}}),c=u.props;for(tr(c,u.opts.specialEasing);a>o;o++)if(r=Gn[o].call(u,e,c,u.opts))return r;return x.map(c,Zn,u),x.isFunction(u.opts.start)&&u.opts.start.call(e,u),x.fx.timer(x.extend(l,{elem:e,anim:u,queue:u.opts.queue})),u.progress(u.opts.progress).done(u.opts.done,u.opts.complete).fail(u.opts.fail).always(u.opts.always)}function tr(e,t){var n,r,i,o,a;for(n in e)if(r=x.camelCase(n),i=t[r],o=e[n],x.isArray(o)&&(i=o[1],o=e[n]=o[0]),n!==r&&(e[r]=o,delete e[n]),a=x.cssHooks[r],a&&"expand"in a){o=a.expand(o),delete e[r];for(n in o)n in e||(e[n]=o[n],t[n]=i)}else t[r]=i}x.Animation=x.extend(er,{tweener:function(e,t){x.isFunction(e)?(t=e,e=["*"]):e=e.split(" ");var n,r=0,i=e.length;for(;i>r;r++)n=e[r],Qn[n]=Qn[n]||[],Qn[n].unshift(t)},prefilter:function(e,t){t?Gn.unshift(e):Gn.push(e)}});function nr(e,t,n){var r,i,o,a,s,l,u=this,c={},p=e.style,f=e.nodeType&&nn(e),d=x._data(e,"fxshow");n.queue||(s=x._queueHooks(e,"fx"),null==s.unqueued&&(s.unqueued=0,l=s.empty.fire,s.empty.fire=function(){s.unqueued||l()}),s.unqueued++,u.always(function(){u.always(function(){s.unqueued--,x.queue(e,"fx").length||s.empty.fire()})})),1===e.nodeType&&("height"in t||"width"in t)&&(n.overflow=[p.overflow,p.overflowX,p.overflowY],"inline"===x.css(e,"display")&&"none"===x.css(e,"float")&&(x.support.inlineBlockNeedsLayout&&"inline"!==ln(e.nodeName)?p.zoom=1:p.display="inline-block")),n.overflow&&(p.overflow="hidden",x.support.shrinkWrapBlocks||u.always(function(){p.overflow=n.overflow[0],p.overflowX=n.overflow[1],p.overflowY=n.overflow[2]}));for(r in t)if(i=t[r],Vn.exec(i)){if(delete t[r],o=o||"toggle"===i,i===(f?"hide":"show"))continue;c[r]=d&&d[r]||x.style(e,r)}if(!x.isEmptyObject(c)){d?"hidden"in d&&(f=d.hidden):d=x._data(e,"fxshow",{}),o&&(d.hidden=!f),f?x(e).show():u.done(function(){x(e).hide()}),u.done(function(){var t;x._removeData(e,"fxshow");for(t in c)x.style(e,t,c[t])});for(r in c)a=Zn(f?d[r]:0,r,u),r in d||(d[r]=a.start,f&&(a.end=a.start,a.start="width"===r||"height"===r?1:0))}}function rr(e,t,n,r,i){return new rr.prototype.init(e,t,n,r,i)}x.Tween=rr,rr.prototype={constructor:rr,init:function(e,t,n,r,i,o){this.elem=e,this.prop=n,this.easing=i||"swing",this.options=t,this.start=this.now=this.cur(),this.end=r,this.unit=o||(x.cssNumber[n]?"":"px")},cur:function(){var e=rr.propHooks[this.prop];return e&&e.get?e.get(this):rr.propHooks._default.get(this)},run:function(e){var t,n=rr.propHooks[this.prop];return this.pos=t=this.options.duration?x.easing[this.easing](e,this.options.duration*e,0,1,this.options.duration):e,this.now=(this.end-this.start)*t+this.start,this.options.step&&this.options.step.call(this.elem,this.now,this),n&&n.set?n.set(this):rr.propHooks._default.set(this),this}},rr.prototype.init.prototype=rr.prototype,rr.propHooks={_default:{get:function(e){var t;return null==e.elem[e.prop]||e.elem.style&&null!=e.elem.style[e.prop]?(t=x.css(e.elem,e.prop,""),t&&"auto"!==t?t:0):e.elem[e.prop]},set:function(e){x.fx.step[e.prop]?x.fx.step[e.prop](e):e.elem.style&&(null!=e.elem.style[x.cssProps[e.prop]]||x.cssHooks[e.prop])?x.style(e.elem,e.prop,e.now+e.unit):e.elem[e.prop]=e.now}}},rr.propHooks.scrollTop=rr.propHooks.scrollLeft={set:function(e){e.elem.nodeType&&e.elem.parentNode&&(e.elem[e.prop]=e.now)}},x.each(["toggle","show","hide"],function(e,t){var n=x.fn[t];x.fn[t]=function(e,r,i){return null==e||"boolean"==typeof e?n.apply(this,arguments):this.animate(ir(t,!0),e,r,i)}}),x.fn.extend({fadeTo:function(e,t,n,r){return this.filter(nn).css("opacity",0).show().end().animate({opacity:t},e,n,r)},animate:function(e,t,n,r){var i=x.isEmptyObject(e),o=x.speed(t,n,r),a=function(){var t=er(this,x.extend({},e),o);(i||x._data(this,"finish"))&&t.stop(!0)};return a.finish=a,i||o.queue===!1?this.each(a):this.queue(o.queue,a)},stop:function(e,n,r){var i=function(e){var t=e.stop;delete e.stop,t(r)};return"string"!=typeof e&&(r=n,n=e,e=t),n&&e!==!1&&this.queue(e||"fx",[]),this.each(function(){var t=!0,n=null!=e&&e+"queueHooks",o=x.timers,a=x._data(this);if(n)a[n]&&a[n].stop&&i(a[n]);else for(n in a)a[n]&&a[n].stop&&Jn.test(n)&&i(a[n]);for(n=o.length;n--;)o[n].elem!==this||null!=e&&o[n].queue!==e||(o[n].anim.stop(r),t=!1,o.splice(n,1));(t||!r)&&x.dequeue(this,e)})},finish:function(e){return e!==!1&&(e=e||"fx"),this.each(function(){var t,n=x._data(this),r=n[e+"queue"],i=n[e+"queueHooks"],o=x.timers,a=r?r.length:0;for(n.finish=!0,x.queue(this,e,[]),i&&i.stop&&i.stop.call(this,!0),t=o.length;t--;)o[t].elem===this&&o[t].queue===e&&(o[t].anim.stop(!0),o.splice(t,1));for(t=0;a>t;t++)r[t]&&r[t].finish&&r[t].finish.call(this);delete n.finish})}});function ir(e,t){var n,r={height:e},i=0;for(t=t?1:0;4>i;i+=2-t)n=Zt[i],r["margin"+n]=r["padding"+n]=e;return t&&(r.opacity=r.width=e),r}x.each({slideDown:ir("show"),slideUp:ir("hide"),slideToggle:ir("toggle"),fadeIn:{opacity:"show"},fadeOut:{opacity:"hide"},fadeToggle:{opacity:"toggle"}},function(e,t){x.fn[e]=function(e,n,r){return this.animate(t,e,n,r)}}),x.speed=function(e,t,n){var r=e&&"object"==typeof e?x.extend({},e):{complete:n||!n&&t||x.isFunction(e)&&e,duration:e,easing:n&&t||t&&!x.isFunction(t)&&t};return r.duration=x.fx.off?0:"number"==typeof r.duration?r.duration:r.duration in x.fx.speeds?x.fx.speeds[r.duration]:x.fx.speeds._default,(null==r.queue||r.queue===!0)&&(r.queue="fx"),r.old=r.complete,r.complete=function(){x.isFunction(r.old)&&r.old.call(this),r.queue&&x.dequeue(this,r.queue)},r},x.easing={linear:function(e){return e},swing:function(e){return.5-Math.cos(e*Math.PI)/2}},x.timers=[],x.fx=rr.prototype.init,x.fx.tick=function(){var e,n=x.timers,r=0;for(Xn=x.now();n.length>r;r++)e=n[r],e()||n[r]!==e||n.splice(r--,1);n.length||x.fx.stop(),Xn=t},x.fx.timer=function(e){e()&&x.timers.push(e)&&x.fx.start()},x.fx.interval=13,x.fx.start=function(){Un||(Un=setInterval(x.fx.tick,x.fx.interval))},x.fx.stop=function(){clearInterval(Un),Un=null},x.fx.speeds={slow:600,fast:200,_default:400},x.fx.step={},x.expr&&x.expr.filters&&(x.expr.filters.animated=function(e){return x.grep(x.timers,function(t){return e===t.elem}).length}),x.fn.offset=function(e){if(arguments.length)return e===t?this:this.each(function(t){x.offset.setOffset(this,e,t)});var n,r,o={top:0,left:0},a=this[0],s=a&&a.ownerDocument;if(s)return n=s.documentElement,x.contains(n,a)?(typeof a.getBoundingClientRect!==i&&(o=a.getBoundingClientRect()),r=or(s),{top:o.top+(r.pageYOffset||n.scrollTop)-(n.clientTop||0),left:o.left+(r.pageXOffset||n.scrollLeft)-(n.clientLeft||0)}):o},x.offset={setOffset:function(e,t,n){var r=x.css(e,"position");"static"===r&&(e.style.position="relative");var i=x(e),o=i.offset(),a=x.css(e,"top"),s=x.css(e,"left"),l=("absolute"===r||"fixed"===r)&&x.inArray("auto",[a,s])>-1,u={},c={},p,f;l?(c=i.position(),p=c.top,f=c.left):(p=parseFloat(a)||0,f=parseFloat(s)||0),x.isFunction(t)&&(t=t.call(e,n,o)),null!=t.top&&(u.top=t.top-o.top+p),null!=t.left&&(u.left=t.left-o.left+f),"using"in t?t.using.call(e,u):i.css(u)}},x.fn.extend({position:function(){if(this[0]){var e,t,n={top:0,left:0},r=this[0];return"fixed"===x.css(r,"position")?t=r.getBoundingClientRect():(e=this.offsetParent(),t=this.offset(),x.nodeName(e[0],"html")||(n=e.offset()),n.top+=x.css(e[0],"borderTopWidth",!0),n.left+=x.css(e[0],"borderLeftWidth",!0)),{top:t.top-n.top-x.css(r,"marginTop",!0),left:t.left-n.left-x.css(r,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent||s;while(e&&!x.nodeName(e,"html")&&"static"===x.css(e,"position"))e=e.offsetParent;return e||s})}}),x.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(e,n){var r=/Y/.test(n);x.fn[e]=function(i){return x.access(this,function(e,i,o){var a=or(e);return o===t?a?n in a?a[n]:a.document.documentElement[i]:e[i]:(a?a.scrollTo(r?x(a).scrollLeft():o,r?o:x(a).scrollTop()):e[i]=o,t)},e,i,arguments.length,null)}});function or(e){return x.isWindow(e)?e:9===e.nodeType?e.defaultView||e.parentWindow:!1}x.each({Height:"height",Width:"width"},function(e,n){x.each({padding:"inner"+e,content:n,"":"outer"+e},function(r,i){x.fn[i]=function(i,o){var a=arguments.length&&(r||"boolean"!=typeof i),s=r||(i===!0||o===!0?"margin":"border");return x.access(this,function(n,r,i){var o;return x.isWindow(n)?n.document.documentElement["client"+e]:9===n.nodeType?(o=n.documentElement,Math.max(n.body["scroll"+e],o["scroll"+e],n.body["offset"+e],o["offset"+e],o["client"+e])):i===t?x.css(n,r,s):x.style(n,r,i,s)},n,a?i:t,a,null)}})}),x.fn.size=function(){return this.length},x.fn.andSelf=x.fn.addBack,"object"==typeof module&&module&&"object"==typeof module.exports?module.exports=x:(e.jQuery=e.$=x,"function"==typeof define&&define.amd&&define("jquery",[],function(){return x}))})(window);
diff -r 000000000000 -r 3ba5983012cf tool_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,173 @@
+
+    
+        
+            
+                 plumbum==1.3.0
+                                                 pysam==0.7.5
+                                                 ftputil==2.2.3 
+             
+         
+     
+ 
+        
+            
+                https://pypi.python.org/packages/source/n/numpy/numpy-1.7.1.tar.gz 
+                $INSTALL_DIR/lib/python 
+                export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python && python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin 
+                
+                    $INSTALL_DIR/lib/python 
+                    $INSTALL_DIR/lib/python 
+                    $INSTALL_DIR/bin 
+                 
+             
+         
+     
+        
+        
+            
+                http://downloads.sourceforge.net/project/freetype/freetype2/2.4.11/freetype-2.4.11.tar.bz2 
+                ./configure --prefix=$INSTALL_DIR/freetype/ 
+                make 
+                make install 
+                
+                    $INSTALL_DIR/freetype/bin/ 
+                    $INSTALL_DIR/freetype/lib/ 
+                    $INSTALL_DIR/freetype/include/ 
+                 
+             
+         
+        Compiling freetype requires a C compiler (typically gcc). 
+     
+
+
+        
+            
+                
+                http://downloads.sourceforge.net/project/matplotlib/matplotlib/matplotlib-1.3.0/matplotlib-1.3.0.tar.gz 
+
+                             
+                
+                $INSTALL_DIR/lib/python 
+                echo "### PYTHONPATH = $PYTHONPATH, PYTHONPATH_NUMPY = $PYTHONPATH_NUMPY" 
+                export PYTHONPATH=$INSTALL_DIR/lib/python:$PYTHONPATH_NUMPY:$PYTHONPATH &&
+                                             export CPLUS_INCLUDE_PATH=$FREETYPE_INCLUDE_DIR:$FREETYPE_INCLUDE_DIR/freetype2/:$CPLUS_INCLUDE_PATH && 
+                                             export LIBRARY_PATH=$FREETYPE_LIB_DIR:$LIBRARY_PATH &&  
+                                             python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin 
+                
+                    $INSTALL_DIR/lib/python 
+                    $ENV[PYTHONPATH_NUMPY] 
+                    $INSTALL_DIR/lib/python 
+                 
+             
+         
+        Compiling matplotlib requires a C compiler (typically gcc) and libpng. 
+     
+
+        
+            
+                https://pypi.python.org/packages/source/b/biopython/biopython-1.61.tar.gz 
+                $INSTALL_DIR/lib/python 
+                
+                    export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python && 
+                    export PATH=$PATH:$PATH_NUMPY && 
+                    export PYTHONPATH=$PYTHONPATH:$PYTHONPATH_NUMPY && 
+                    python setup.py install --install-lib $INSTALL_DIR/lib/python
+                 
+                
+                    $INSTALL_DIR/lib/python 
+                    $ENV[PYTHONPATH_NUMPY] 
+                    $ENV[PATH_NUMPY] 
+                    $INSTALL_DIR/lib/python 
+                 
+             
+         
+     
+ 
+        
+            
+                https://pypi.python.org/packages/source/H/HTSeq/HTSeq-0.5.4p3.tar.gz 
+   
+                $INSTALL_DIR/lib/python 
+                
+                    export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python && 
+                    export PATH=$PATH:$PATH_NUMPY && 
+                    export PYTHONPATH=$PYTHONPATH:$PYTHONPATH_NUMPY && 
+                    python setup.py install --install-lib $INSTALL_DIR/lib/python --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin
+                 
+                
+                    $INSTALL_DIR/lib/python 
+                    $ENV[PYTHONPATH_NUMPY] 
+                    $ENV[PATH_NUMPY] 
+                    $INSTALL_DIR/lib/python 
+                 
+             
+         
+     
+
+    
+        
+            
+                http://sourceforge.net/projects/samtools/files/samtools/0.1.19/samtools-0.1.19.tar.bz2 
+                make 
+                
+                    samtools 
+                    $INSTALL_DIR/bin 
+                 
+                
+                    $INSTALL_DIR/bin 
+                 
+             
+         
+     
+    
+        
+            
+                http://sourceforge.net/projects/bio-bwa/files/bwa-0.7.4.tar.bz2  
+                make 
+                
+                    bwa 
+                    $INSTALL_DIR/bin 
+                 
+                
+                    $INSTALL_DIR/bin 
+                 
+             
+         
+     
+    
+    
+        
+            
+                https://rna-star.googlecode.com/files/STAR_2.3.0e.tgz 
+                
+                    STAR 
+                    $INSTALL_DIR/bin 
+                 
+                
+                    $INSTALL_DIR/bin 
+                 
+             
+         
+    
+     
+    
+        
+            
+                http://www.bx.psu.edu/miller_lab/dist/lastz-1.02.00.tar.gz 
+                sed -i 's/-Werror//g' src/Makefile 
+                make 
+                
+                         src/lastz 
+                         $INSTALL_DIR/bin 
+                      
+                     
+                         src/lastz_D 
+                         $INSTALL_DIR/bin 
+                      
+                
+                    $INSTALL_DIR/bin 
+                 
+             
+         
+        
+ 
\ No newline at end of file
diff -r 000000000000 -r 3ba5983012cf vhom.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vhom.py	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,1315 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import logging
+import tempfile
+import shutil
+
+import subprocess
+import bz2
+import re
+import glob
+
+from shutil import rmtree
+from subprocess import PIPE
+from collections import defaultdict, Counter
+
+try:
+    import pysam
+except ImportError:
+    message = 'This script requires the pysam python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    from plumbum import cli
+except ImportError:
+    message = 'This script requires the plumbum python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    from Bio.SeqRecord import SeqRecord
+    from Bio import SeqIO
+    from Bio.Seq import Seq
+    from Bio.Alphabet import IUPAC, Gapped
+except ImportError:
+    message = 'This script requires the BioPython python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    import HTSeq
+except ImportError:
+    message = 'This script requires the HTSeq python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+class CLI(cli.Application):
+
+    """ Homologous region analysis of hit files"""
+    PROGNAME = "vmap"
+    VERSION = "1.0.0"
+    DESCRIPTION = """"""
+    USAGE = """"""
+
+    def main(self, *args):
+
+        if args:
+            print("Unknown command %r" % (args[0]))
+            print self.USAGE
+            return 1
+
+        if not self.nested_command:
+            print("No command given")
+            print self.USAGE
+            return 1
+
+class SequenceProxy:
+
+    def __init__(self, references_path, human_transcripts_path=None):
+
+        self.references_path = references_path
+        self.human_transcripts_path = human_transcripts_path
+
+        self.human_transcript_records = None
+        self.reference_records = None
+        self.hit_records = []
+
+        self.transcript_ranges = None
+
+    def add_hit_file(self, hit_file_path):
+
+        logging.debug('SequenceProxy: adding new hit file %s' % hit_file_path)
+
+        hit_handle = bz2.BZ2File(hit_file_path, 'rb')
+        try:
+            hit_handle.read(10)
+        except IOError:
+            hit_handle = open(hit_file_path, 'rb')
+
+        hit_dict = SeqIO.to_dict(SeqIO.parse(hit_handle, "fasta"))
+
+        logging.debug('SequenceProxy: hit file has %i entries' % len(hit_dict))
+
+        self.hit_records.append(hit_dict)
+
+    def get_read_record(self, read_id):
+
+        for record_dict in self.hit_records:
+            if read_id in record_dict:
+                record = record_dict[read_id]
+                return SeqRecord(record.seq, record.id, '', '')
+
+        return None
+
+    def get_reference_record(self, identifier):
+
+        if not self.reference_records:
+            logging.debug('SequenceProxy: indexing reference records')
+            self.reference_records = SeqIO.index(self.references_path, 'fasta')
+
+        try:
+            record = self.reference_records[identifier]
+            return SeqRecord(record.seq, record.id, '', '')
+        except KeyError:
+            return None
+
+    def get_transcript_record(self, identifier):
+
+        if not self.human_transcripts_path:
+            return None
+
+        if not self.human_transcript_records:
+            logging.debug('SequenceProxy: indexing human transcripts')
+            self.human_transcript_records = SeqIO.index(
+                self.human_transcripts_path, 'fasta')
+
+        try:
+            record = self.human_transcript_records[identifier]
+            return SeqRecord(record.seq, record.id, '', '')
+
+        except KeyError:
+            return None
+
+    def get_overlapping_human_transcript_identifiers(self, chromosome, start, end):
+
+        if not self.human_transcripts_path:
+            return set()
+
+        if not self.transcript_ranges:
+            self._set_transcript_ranges()
+
+        interval = HTSeq.GenomicInterval(chromosome, start, end, '.')
+        all_transcript_ids = set()
+        for interval, transcript_ids in self.transcript_ranges[interval].steps():
+            all_transcript_ids = all_transcript_ids.union(transcript_ids)
+
+        return all_transcript_ids
+
+    def _set_transcript_ranges(self):
+
+        logging.debug(
+            'SequenceProxy: Preparing transcript ranges, please wait...')
+
+        self.transcript_ranges = HTSeq.GenomicArrayOfSets(
+            'auto', stranded=False)
+
+        for record in SeqIO.parse(self.human_transcripts_path, 'fasta'):
+            transcript_id = record.id
+            for exon in record.description.strip().split(' ')[1].split('|'):
+                chromosome, start, end = exon.split(';')
+                start, end = sorted((int(start), int(end)))
+                if start == end:
+                    continue
+                interval = HTSeq.GenomicInterval(chromosome, start, end, ".")
+                self.transcript_ranges[interval] += transcript_id
+
+
+class JalviewRunner:
+
+    def __init__(self, jalview_jar_dir, tmp_dir=None):
+
+        self.jalview_jar_dir = jalview_jar_dir
+
+        if tmp_dir:
+            self.tmp_dir = tempfile.mkdtemp(
+                dir=os.path.abspath(os.path.expandvars(tmp_dir)))
+        else:
+            self.tmp_dir = tempfile.mkdtemp()
+
+        self.config = ['#---JalviewX Properties File---',
+                       '#Fri Nov 04 14:23:53 CET 2011',
+                       'FIGURE_AUTOIDWIDTH=true',
+                       'ID_ITALICS=false',
+                       'SORT_ALIGNMENT=No sort',
+                       'SHOW_IDENTITY=true',
+                       'FONT_NAME=SansSerif',
+                       'GAP_SYMBOL=.',
+                       'SHOW_QUALITY=false',
+                       'SHOW_GROUP_CONSERVATION=false',
+                       'FONT_STYLE=plain',
+                       'ANTI_ALIAS=false',
+                       'SORT_BY_TREE=false',
+                       'SHOW_CONSENSUS_HISTOGRAM=false',
+                       'SHOW_OVERVIEW=false',
+                       'DEFAULT_COLOUR=Nucleotide',
+                       'SHOW_CONSENSUS_LOGO=false',
+                       'SHOW_ANNOTATIONS=true',
+                       'SHOW_UNCONSERVED=true',
+                       'AUTO_CALC_CONSENSUS=true',
+                       'PAD_GAPS=true',
+                       'FONT_SIZE=10',
+                       'RIGHT_ALIGN_IDS=false',
+                       'WRAP_ALIGNMENT=false',
+                       'FASTA_JVSUFFIX=false',
+                       'PILEUP_JVSUFFIX=false',
+                       'SHOW_JVSUFFIX=false']
+
+    def _make_configuration(self, sub_tmp_dir):
+
+        config_path = os.path.join(sub_tmp_dir, 'config.txt')
+        logging.debug(
+            'JalviewRunner: writing configuration file %s' % config_path)
+        with open(config_path, 'w') as config_file:
+            for c in self.config:
+                config_file.write(c + '\n')
+
+        return config_path
+
+    def _delete_tmp_dir(self):
+
+        try:
+            rmtree(self.tmp_dir)
+        except OSError:
+            pass
+
+    def _delete_sub_tmp_dir(self, sub_tmp_dir):
+
+        try:
+            rmtree(sub_tmp_dir)
+        except OSError:
+            pass
+
+    def run(self, fasta_path, png_output_path):
+
+        fasta_path = os.path.abspath(os.path.expandvars(fasta_path))
+
+        png_output_path = os.path.abspath(os.path.expandvars(png_output_path))
+
+        sub_tmp_dir = tempfile.mkdtemp(dir=self.tmp_dir)
+        config_path = self._make_configuration(sub_tmp_dir)
+
+        logging.debug(
+            'JalviewRunner: running Jalview on fasta %s with temp dir %s' %
+            (fasta_path, sub_tmp_dir))
+
+        cline = ['java', '-Djava.ext.dirs=%s/lib' % self.jalview_jar_dir,
+                 '-cp %s/jalview.jar' % self.jalview_jar_dir, 'jalview.bin.Jalview',
+                 '-open', fasta_path, '-nodisplay', '-png', png_output_path,
+                 '-colour', 'nucleotide', '-props', config_path]
+
+        # Run sibelia process
+        java_process = subprocess.Popen(
+            ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
+
+        # Block until streams are closed by the process
+        stdout, stderr = java_process.communicate()
+
+        if stderr:
+            sys.stderr.write('JalviewRunner: ' + stderr.replace('\n', ' '))
+
+        self._delete_sub_tmp_dir(sub_tmp_dir)
+
+        return png_output_path
+
+    def __del__(self):
+
+        self._delete_tmp_dir()
+
+
+class LastzRunner:
+
+    def __init__(self, lastz_path=None, word_length=7):
+
+        self.lastz_path = lastz_path
+        self.word_length = word_length
+
+    def align_to_bam_file(self, reference_fasta_path, query_fasta_path, output_bam_path, multiple=False, assert_record=None):
+
+        logging.debug('LastzRunner: running on reference %s and query %s' %
+                     (reference_fasta_path, query_fasta_path))
+        output_sam_path = os.path.abspath(
+            os.path.expandvars(output_bam_path.replace('.bam', '.sam')))
+        output_bam_unsorted_path = os.path.abspath(
+            os.path.expandvars(output_bam_path + '.unsorted'))
+
+        logging.debug(
+            'LastzRunner: aligning with output in temporary sam file %s' %
+            output_sam_path)
+        with open(output_sam_path, 'w') as output_sam_handler:
+            for line in self._align(reference_fasta_path, query_fasta_path, multiple):
+                output_sam_handler.write(line)
+
+        logging.debug(
+            'LastzRunner: transforming sam into unsorted bam file %s' %
+            output_bam_unsorted_path)
+        input_sam_handler = pysam.Samfile(output_sam_path, "r")
+        output_bam_file = pysam.Samfile(
+            output_bam_unsorted_path, "wb", template=input_sam_handler)
+
+        logging.debug(
+            'LastzRunner: copying from sam file to bam file')
+        for s in input_sam_handler:
+            output_bam_file.write(s)
+        output_bam_file.close()
+
+        logging.debug('LastzRunner: sorting and indexing bam file %s' %
+                      output_bam_path)
+        pysam.sort(output_bam_unsorted_path,
+                   output_bam_path.replace('.bam', ''))
+
+        pysam.index(output_bam_path)
+
+    def align_to_samlines(self, reference_fasta_path, query_fasta_path, multiple=False):
+
+        logging.debug('LastzRunner: running on reference %s and query %s' %
+                      (reference_fasta_path, query_fasta_path))
+
+        for line in self._align(reference_fasta_path, query_fasta_path, multiple):
+            yield line
+
+    def _align(self, reference_fasta_path, query_fasta_path, multiple):
+
+        reference_fasta_path = os.path.abspath(
+            os.path.expandvars(reference_fasta_path))
+        query_fasta_path = os.path.abspath(
+            os.path.expandvars(query_fasta_path))
+
+        lastz = [self.lastz_path and self.lastz_path or 'lastz']
+        if multiple:
+            cline = lastz + [
+                reference_fasta_path +
+                '[multiple]', query_fasta_path + '[nameparse=darkspace]',
+                '--format=sam', '--strand=both', '--gapped', '--chain',
+                '--nogfextend', '--seed=match%i' % self.word_length]
+
+        else:
+            cline = lastz + [
+                reference_fasta_path, query_fasta_path +
+                '[nameparse=darkspace]',
+                '--format=sam', '--strand=both', '--gapped',
+                '--chain', '--nogfextend', '--seed=match%i' % self.word_length]
+
+        # Run lastz process
+        lastz_process = subprocess.Popen(
+            ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
+
+        # Filter multiple read mappings (one to each strand) by only keeping
+        # the one with the most matches
+        alignments = []
+        for i, line in enumerate(iter(lastz_process.stdout.readline, '')):
+            if line.startswith('@'):
+                yield line
+            elif line.lower().startswith('read'):
+
+                fields = line.split('\t')
+                read_name = fields[0]
+                read_cigar = fields[5]
+                if not alignments:
+                    alignments = [read_name, line, read_cigar]
+                    continue
+                else:
+                    if alignments[0] == read_name:
+                        this_mapping = sum(
+                            [int(c[:-1]) for c in re.findall('[0-9]*M', read_cigar)])
+                        other_mapping = sum(
+                            [int(c[:-1]) for c in re.findall('[0-9]*M', alignments[2])])
+
+                        if other_mapping > this_mapping:
+                            yield alignments[1]
+                        else:
+                            yield line
+                    else:
+                        yield line
+                    alignments = []
+            else:
+                yield line
+
+
+class ConsensusRunner:
+
+    def __init__(self, ambiguity_cutoff=0.3):
+
+        self.ambiguity_cutoff = ambiguity_cutoff
+        self.ambiguity = {
+            'GT': 'K', 'AC': 'M', 'AG': 'R', 'CT': 'Y', 'CG': 'S',
+            'AT': 'W', 'CGT': 'B', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D',
+            'ACGT': 'N'}
+
+        self.alphabet = Gapped(IUPAC.unambiguous_dna, "-")
+
+    def run(self, input_sam_path, output_fasta_path):
+
+        samfile = pysam.Samfile(input_sam_path, "rb")
+        references = samfile.references
+
+        logging.debug(
+            'ConsensusRunner: writing consensus of bam file %s' % input_sam_path)
+
+        assert len(references) == 1
+
+        consensus = defaultdict(list)
+        conditions = {}
+
+        base_mapper = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, '-': 5}
+        ambiguity = self.ambiguity
+
+        alphabet = Gapped(IUPAC.ambiguous_dna)
+        frequency_cutoff = self.ambiguity_cutoff
+
+        # Get all conditions of aligned reads
+        for alignedread in samfile.fetch(references[0]):
+
+            name = alignedread.qname
+            length = len(alignedread.seq)
+            condition = None
+
+            if name.lower().startswith('read'):
+                condition = ';'.join(name.split(';')[:2])
+            else:
+                condition = ';'.join(name.split(';')[:-1])
+
+            if condition in conditions:
+                if name not in conditions[condition][0]:
+                    conditions[condition][0].add(name)
+                    conditions[condition][1] += 1
+                    conditions[condition][2] += length
+            else:
+                conditions[condition] = [set([name]), 1, length]
+
+        # Prepare count dictionaries
+        consensus = defaultdict(list)
+
+        for i, pileupcolumn in enumerate(samfile.pileup(references[0], max_depth=100000)):
+
+            if i % 1000 == 0:
+                logging.debug(
+                    'ConsensusRunner: making consensus at position %i' % i)
+
+            counters = {}
+            for condition in conditions:
+                counters[condition] = [0, 0, 0, 0, 0, 0]  # ACGTN-
+
+            for pileupread in pileupcolumn.pileups:
+
+                alignment = pileupread.alignment
+                name = alignment.qname
+
+                if pileupread.is_del:
+                    base = '-'
+                else:
+                    base = alignment.seq[pileupread.qpos].upper()
+
+                if name.lower().startswith('read'):
+                    condition = ';'.join(name.split(';')[:2])
+                    counters[condition][base_mapper[base]] += 1
+                else:
+                    condition = ';'.join(name.split(';')[:-1])
+                    counters[condition][base_mapper[base]] += 1
+
+            for condition, counts in counters.iteritems():
+                depth = float(sum(counts))
+                bases = []
+                a, c, g, t, n, gap = counts
+
+                if depth > 0:
+                    if (n / depth) >= frequency_cutoff:
+                        consensus[condition].append('N')
+                    else:
+                        if (a / depth) >= frequency_cutoff:
+                            bases.append('A')
+                        if (c / depth) >= frequency_cutoff:
+                            bases.append('C')
+                        if (g / depth) >= frequency_cutoff:
+                            bases.append('G')
+                        if (t / depth) >= frequency_cutoff:
+                            bases.append('T')
+
+                if len(bases) > 1:
+                    consensus[condition].append(
+                        ambiguity[''.join(sorted(bases))])
+                elif len(bases) == 1:
+                    consensus[condition].append(bases[0])
+                else:
+                    consensus[condition].append('-')
+
+        # Split consensuses by type
+        reference_consensuses = []
+        read_consensuses = []
+        for condition, sequence_list in consensus.iteritems():
+            if condition.lower().startswith('read'):
+                read_consensuses.append((''.join(sequence_list), condition))
+            else:
+                reference_consensuses.append(
+                    (''.join(sequence_list), condition))
+
+        reference_consensuses.sort(reverse=True)
+        read_consensuses.sort(reverse=True)
+
+        # Get start and end of reads (remove fully gapped positions)
+        start = None
+        end = None
+        for read_consensus, condition in read_consensuses:
+
+            # Forward direction
+            for i, char in enumerate(read_consensus):
+                if char != '-':
+                    if start is None:
+                        start = i
+                    else:
+                        start = min(start, i)
+                    break
+
+            # Reverse direction
+            for i, char in enumerate(read_consensus[::-1]):
+                if char != '-':
+                    if end is None:
+                        end = i
+                    else:
+                        end = min(end, i)
+                    break
+
+        if end == 0 or end is None:
+            end = None
+        else:
+            end = -end
+
+        # Filter all records by start and end of reads
+        reference_consensuses =\
+            [(r[start:end], c) for r, c in reference_consensuses]
+
+        read_consensuses =\
+            [(r[start:end], c) for r, c in read_consensuses]
+
+        # Write consensus records to file
+        with open(output_fasta_path, 'w',  buffering=100 * 1024 * 1024) as output_handler:
+            for consensus, condition in read_consensuses + reference_consensuses:
+                stats = ';'.join(map(str, conditions[condition][1:]))
+                record = SeqRecord(
+                    Seq(consensus, alphabet=alphabet), id=condition + ';' + stats, name='', description='')
+                SeqIO.write([record], output_handler, "fasta")
+
+
+class Group:
+
+    def __init__(self, family_name, qualified_family_name, sequence_proxy, tmp_dir=None):
+
+        self.family_name = family_name
+        self.qualified_family_name = qualified_family_name
+        self.sequence_proxy = sequence_proxy
+
+        self.regions = []
+        self.fasta_path = None
+
+        if tmp_dir:
+            self.tmp_dir = tempfile.mkdtemp(
+                dir=os.path.abspath(os.path.expandvars(tmp_dir)))
+        else:
+            self.tmp_dir = tempfile.mkdtemp()
+
+    def add_region(self, read_id, pathogen_mapping_locations, human_mapping_locations):
+
+        region = Region(self.sequence_proxy, self.tmp_dir)
+
+        region.add_read(read_id)
+
+        for mapping_location in list(pathogen_mapping_locations) + list(human_mapping_locations):
+            reference = ';'.join(mapping_location[:-2])
+            region.add_reference(
+                reference, int(mapping_location[-2]), int(mapping_location[-1]))
+
+        self.regions.append(region)
+
+    def write_outputs(self, output_dir, lastz_runner, consensus_builder, jalview_runner):
+
+        if not self.regions:
+            logging.debug('Group %s: no regions for family' % self.family_name)
+            return
+
+        made_output_dir = False
+        for i, region in enumerate(self.regions):
+
+            if not made_output_dir:
+                output_dir = os.path.abspath(
+                    os.path.expandvars(os.path.join(output_dir, self.qualified_family_name)))
+                logging.debug('Group %s: writing output files to %s' %
+                              (self.family_name, output_dir))
+                try:
+                    os.makedirs(output_dir)
+                except OSError:
+                    pass
+                made_output_dir = True
+
+            logging.debug('Group %s: writing region number %i files to %s' %
+                          (self.family_name, i + 1, output_dir))
+
+            unaligned_path = os.path.join(
+                output_dir, 'region_%i_unaligned.fa.bzip2' % (i + 1))
+            compressed_unaligned = bz2.BZ2File(unaligned_path, 'wb')
+            with open(region.get_unaligned_fasta_path()) as unaligned_fasta:
+                for line in unaligned_fasta:
+                    compressed_unaligned.write(line)
+
+            bam_path = os.path.join(
+                output_dir, 'region_%i_alignment.bam' % (i + 1))
+            temporary_alignment_bam_path = region.get_alignment_bam_path(
+                lastz_runner, assert_record='Read')
+            shutil.copy(temporary_alignment_bam_path, bam_path)
+
+            shutil.copy(
+                temporary_alignment_bam_path + '.bai', bam_path + '.bai')
+
+            consensus_path = os.path.join(
+                output_dir, 'region_%i_consensus.fa' % (i + 1))
+            temporary_consensus_path = region.get_consensus_fasta_path(
+                consensus_builder, temporary_alignment_bam_path)
+            shutil.copy(temporary_consensus_path, consensus_path)
+
+            if jalview_runner:
+
+                jalview_path = os.path.join(
+                    output_dir, 'region_%i_consensus.png' % (i + 1))
+                temporary_jalview_path = region.get_consensus_figure_path(
+                    jalview_runner, temporary_consensus_path)
+                shutil.copy(temporary_jalview_path, jalview_path)
+
+    def _delete_temporary_dir(self):
+
+        self.fasta_path = None
+        try:
+            rmtree(self.tmp_dir)
+        except OSError:
+            pass
+
+    def filter_regions(self, min_region_length=50, min_read_number=5):
+
+        logging.debug('Group %s: filtering %i candidate regions' %
+                      (self.family_name, len(self.regions)))
+
+        filtered = [r for r in self.regions if len(
+            r) >= min_read_number and r.get_longest_reference_length() >= min_region_length]
+
+        ordered = sorted(filtered, reverse=True)
+
+        if ordered:
+            length = ordered[0].length
+        else:
+            length = 0
+
+        logging.debug(
+            'Group %s: %i candidate regions remained after filtering, the longest is %i bp long' %
+            (self.family_name, len(ordered), length))
+
+        self.regions = ordered
+
+        return ordered
+
+    def merge_regions(self, max_gap_length):
+        """ Merges candidate regions into homologous regions. """
+
+        logging.debug('Group %s: merging %i candidate regions' %
+                      (self.family_name, len(self.regions)))
+
+        if len(self.regions) > 1:
+
+            potentially_mergable = self.regions
+            not_mergable = []
+
+            while len(potentially_mergable) > 1:
+
+                merged = False
+                current = potentially_mergable[0]
+                compared_to = potentially_mergable[1:]
+
+                for region in compared_to:
+                    if region.overlaps(current, max_gap_length):
+                        region.merge(current)
+                        region.clean_references(max_gap_length)
+                        #logging.debug('Group %s: merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
+                        potentially_mergable = compared_to
+                        merged = True
+                        break
+
+                if not merged:
+                    not_mergable.append(current)
+                    potentially_mergable = compared_to
+                    #logging.debug('Group %s: not merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
+
+            results = not_mergable + potentially_mergable
+
+            logging.debug('Group %s: merged into %i regions' %
+                          (self.family_name, len(results)))
+
+            self.regions = results
+
+        else:
+            logging.debug(
+                'Group %s: found only 1 region, no mergin necessary' % self.family_name)
+
+    def add_transcripts_to_regions(self):
+
+        logging.debug('Group %s: enriching %i regions with human transcript information'
+                      % (self.family_name, len(self.regions)))
+
+        for region in self.regions:
+
+            added_transcripts = 0
+
+            for identifier, locations in region.references.iteritems():
+                chromosome = identifier.split(';')[-1]
+
+                for start, end in locations:
+                    transcript_identifiers = self.sequence_proxy.get_overlapping_human_transcript_identifiers(
+                        chromosome, start, end)
+
+                    for transcript_identifier in transcript_identifiers:
+                        region.add_transcript(transcript_identifier)
+                        added_transcripts += 1
+
+            logging.debug('Group %s: added %i transcripts to a region' %
+                          (self.family_name, added_transcripts))
+
+    def __str__(self):
+
+        s = "Group %s with %i regions" % (self.family_name, len(self.regions))
+
+        return s
+
+
+class GroupGenerator:
+
+    """ Produces Groups from Hitfiles. Reads are assigned to groups based on
+        taxonomic families. Transcripts overlapping human read mappings as well
+        as genome sequences of pathogens the reads map to are added to the
+        Groups.
+    """
+
+    def __init__(self, sequence_proxy, reference_database_filter=None, pathogen_family_filter=None, tmp_dir=None):
+
+        self.sequence_proxy = sequence_proxy
+
+        self.tmp_path = tmp_dir
+        self.reference_database_filter = reference_database_filter
+        self.pathogen_family_filter = pathogen_family_filter
+
+        self.groups = {}
+
+        if tmp_dir:
+            self.tmp_dir = tempfile.mkdtemp(
+                dir=os.path.abspath(os.path.expandvars(tmp_dir)))
+        else:
+            self.tmp_dir = tempfile.mkdtemp()
+
+    def get_groups(self, min_read_number=5):
+
+        logging.debug(
+            'GroupGenerator: generating groups from %i hit files, please wait...' %
+            len(self.sequence_proxy.hit_records))
+
+        for hit_index in self.sequence_proxy.hit_records:
+            for read_id, record in hit_index.iteritems():
+
+                # Extract mapping locations to human DNA, human transcript, and
+                # pathogen genomes
+                mapping_locations = record.description.strip().split(
+                    ' ')[1].split('|')
+
+                pathogen_families = defaultdict(set)
+                human_mapping_locations = set()
+
+                for mapping_location in mapping_locations:
+                    fields = mapping_location.split(';')
+
+                    # Convert start and end to integers
+                    fields[-2] = int(fields[-2])  # start
+                    fields[-1] = int(fields[-1])  # end
+
+                    # Add mapping locations to human cDNA or human DNA
+                    if fields[2] == 'Homo_sapiens':
+                        human_mapping_locations.add(tuple(fields))
+
+                    # Add mapping locations to non-human references
+                    else:
+                        reference_database, family = fields[:2]
+                        if self.reference_database_filter and reference_database\
+                                not in self.reference_database_filter:
+                            continue
+                        if self.pathogen_family_filter and family\
+                                not in self.pathogen_family_filter:
+                            continue
+                        pathogen_families[
+                            (reference_database, family)].add(tuple(fields))
+
+                if not pathogen_families:
+                    continue
+
+                # Process the hits to different pathogen families
+                for (reference_database, family), pathogen_locations in pathogen_families.iteritems():
+
+                    qualified_family_name = reference_database + '_' + family
+
+                    # Obtain existing group or make new group
+                    if qualified_family_name in self.groups:
+                        group = self.groups[qualified_family_name]
+                    else:
+                        group = Group(family, qualified_family_name,
+                                      self.sequence_proxy, self.tmp_dir)
+                        self.groups[qualified_family_name] = group
+
+                    group.add_region(
+                        read_id, pathogen_locations, human_mapping_locations)
+
+        # Exclude groups that have collected to few reads
+        for qualified_family_name, group in self.groups.items():
+            if len(group.regions) < min_read_number:
+                del self.groups[qualified_family_name]
+            else:
+                logging.debug(
+                    'GroupGenerator: made new group for family %s' % qualified_family_name)
+
+        return self.groups
+
+
+class Region:
+
+    def __init__(self, sequence_proxy, tmp_dir=None):
+
+        self.sequence_proxy = sequence_proxy
+
+        self.references = defaultdict(set)
+        self.reads = set()
+        self.transcripts = set()
+        self.length = None
+
+        self.sorted_reference_positions = None
+
+        self.master_tmp = tmp_dir
+        self.tmp_dir = None
+
+        self.unaligned_fasta_path = None
+        self.alignment_sam_path = None
+        self.consensus_fasta_path = None
+
+    def add_reference(self, name, start, end):
+
+        assert start < end
+        self.length = None
+        self.longest_reference_id = None
+        self.references[name].add((start, end))
+
+    def add_read(self, name):
+
+        self.reads.add(name)
+
+    def add_transcript(self, name):
+
+        self.transcripts.add(name)
+
+    def get_tmp_path(self):
+
+        if self.tmp_dir:
+            return self.tmp_dir
+
+        if self.master_tmp:
+            self.tmp_dir = tempfile.mkdtemp(
+                dir=os.path.abspath(os.path.expandvars(self.master_tmp)))
+        else:
+            self.tmp_dir = tempfile.mkdtemp()
+
+        return self.tmp_dir
+
+    def _distance(self, range1, range2):
+
+        first, second = sorted((range1, range2))
+
+        if first[0] <= first[1] < second[0]:
+            return (second[0] - first[1])
+        else:
+            return 0
+
+    def _delete_temporary_dir(self):
+
+        self.unaligned_fasta_path = None
+        if self.tmp_dir:
+            try:
+                rmtree(self.tmp_dir)
+            except OSError:
+                pass
+
+    def _get_unaligned_fasta_path(self):
+
+        output_path = os.path.join(self.get_tmp_path(), 'unaligned_region.fa')
+        logging.debug('Region: writing unaligned fasta to %s' % output_path)
+
+        assert self.reads
+
+        with open(output_path, 'w',  buffering=100 * 1024 * 1024) as output_handler:
+
+            # Cut and write references
+            human_positions = []
+            pathogen_positions = []
+            for length, identifier, start, end in self.get_sorted_reference_positions():
+                if ';Homo_sapiens;' in identifier:
+                    human_positions.append((identifier, start, end))
+                else:
+                    pathogen_positions.append((identifier, start, end))
+
+            for identifier, start, end in pathogen_positions + human_positions:
+                record = self.sequence_proxy.get_reference_record(identifier)
+                if record:
+                    record = SeqRecord(record.seq, record.id, '', '')
+                    #record.id += ';%i-%i' % (start, end)
+                    SeqIO.write(
+                        [record[start - 1:end]], output_handler, "fasta")
+                else:
+                    pass
+                    #logging.debug('Region: could not retrieve reference %s' % identifier)
+
+            # Write full-length transcripts
+            for identifier in self.transcripts:
+                record = self.sequence_proxy.get_transcript_record(identifier)
+                if record:
+                    record = SeqRecord(record.seq, record.id, '', '')
+                    SeqIO.write([record], output_handler, "fasta")
+                else:
+                    logging.debug(
+                        'Region: could not retrieve reference %s' % identifier)
+
+            # Write reads
+            for read_id in sorted(self.reads):
+                record = self.sequence_proxy.get_read_record(read_id)
+                if record:
+                    # Transform read ID so that LASTZ can work with it (it
+                    # makes nicknames based on some characters )
+                    identifier = record.id.replace(
+                        ':', '_').replace('|', '_').replace('/', '_')
+                    record = SeqRecord(record.seq, identifier, '', '')
+                    SeqIO.write([record], output_handler, "fasta")
+                else:
+                    logging.debug(
+                        'Region: could not retrieve read %s' % read_id)
+
+            self.unaligned_fasta_path = output_path
+
+        return self.unaligned_fasta_path
+
+    def get_unaligned_fasta_path(self):
+
+        if not self.unaligned_fasta_path:
+            self._get_unaligned_fasta_path()
+
+        return self.unaligned_fasta_path
+
+    def _align_fastas(self, aligner, assert_record=None):
+
+        # Write reference fasta
+        queries_path = self.get_unaligned_fasta_path()
+        first_record = SeqIO.parse(open(queries_path, "rU"), "fasta").next()
+        reference_path = os.path.join(self.get_tmp_path(), 'reference.fa')
+
+        logging.debug(
+            'Region: writing longest reference to %s' % reference_path)
+
+        SeqIO.write([first_record], reference_path, "fasta")
+
+        # Do alignment
+        bam_path = os.path.join(self.get_tmp_path(), 'aligned.bam')
+        logging.debug(
+            'Region: starting alignment using queries %s to sam path %s' %
+            (queries_path, bam_path))
+
+        aligner.align_to_bam_file(
+            reference_path, queries_path, bam_path, assert_record=assert_record)
+
+        return bam_path
+
+    def _build_alignment_consensus(self, consensus_builder, alignment_bam_path):
+
+        consensus_path = os.path.join(self.get_tmp_path(), 'consensus.fa')
+        logging.debug('Region: writing consensus to %s' % consensus_path)
+
+        consensus_builder.run(alignment_bam_path, consensus_path)
+
+        return consensus_path
+
+    def get_alignment_bam_path(self, aligner, assert_record=None):
+
+        return self._align_fastas(aligner, assert_record)
+
+    def get_consensus_fasta_path(self, consensus_builder, alignment_bam_path):
+
+        return self._build_alignment_consensus(consensus_builder, alignment_bam_path)
+
+    def get_consensus_figure_path(self, jalview_runner, consensus_fasta_path):
+
+        png_path = os.path.join(self.get_tmp_path(), 'consensus_figure.png')
+
+        return jalview_runner.run(consensus_fasta_path, png_path)
+
+    def overlaps(self, other, max_gap_length):
+
+        overlap = (set(self.references) & set(other.references))
+
+        # Overlap is solely based on non-human references
+        overlap = [ref for ref in overlap if not ';Homo_sapiens;' in ref]
+
+        if not overlap:
+            return False
+
+        for reference in overlap:
+            reflist = self.references[reference]
+            for start, end in reflist:
+                for other_start, other_end in other.references[reference]:
+                    distance = self._distance((start, end),
+                                             (other_start, other_end))
+                    if distance <= max_gap_length:
+                        return True
+        return False
+
+    def merge(self, other):
+
+        for reference, reflist in other.references.iteritems():
+            for (start, end) in reflist:
+                self.add_reference(reference, start, end)
+
+        for read in other.reads:
+            self.add_read(read)
+
+    def clean_references(self, max_gap_length):
+
+        for reference, reflist in self.references.iteritems():
+
+            start_list = sorted(reflist)
+            saved = list(start_list[0])
+            result = set()
+
+            for item in start_list:
+                if self._distance(saved, item) <= max_gap_length:
+                    saved[1] = max(saved[1], item[1])
+                else:
+                    result.add(tuple(saved))
+                    saved[0] = item[0]
+                    saved[1] = item[1]
+            result.add(tuple(saved))
+            self.references[reference] = result
+
+    def to_sorted_record_ids(self):
+
+        references = []
+        for name, locations in self.references.iteritems():
+            for start, end in locations:
+                references.append(end - start, name)
+
+        reads = defaultdict(list)
+        for name, locations in self.references.iteritems():
+            condition = name.split(';')[1]
+            for start, end in locations:
+                reads[condition].append(end - start, name)
+
+        all_record_ids = sorted(references, reverse=True)
+        for condition, the_reads in reads.iteritems():
+            all_record_ids += sorted(the_reads, reverse=True)
+
+        for length, record_id in all_record_ids:
+            yield record_id
+
+    def __len__(self):
+
+        return len(self.reads)
+
+    def __cmp__(self, other):
+
+        if self.get_longest_reference_length() <\
+                other.get_longest_reference_length():
+            return -1
+        elif self.get_longest_reference_length() ==\
+                other.get_longest_reference_length():
+            return 0
+        return 1
+
+    def get_longest_reference_length(self):
+
+        if self.length is not None:
+            return self.length
+
+        positions = self.get_sorted_reference_positions()
+        if positions:
+            self.length = self.get_sorted_reference_positions()[0][0]
+        else:
+            self.length = 0
+
+        return self.length
+
+    def get_sorted_reference_positions(self):
+
+        if self.sorted_reference_positions is not None:
+            return self.sorted_reference_positions
+
+        lengths = []
+        for reference, the_set in self.references.iteritems():
+            for start, end in the_set:
+                lengths.append([end - start, reference, start, end])
+
+        self.sorted_reference_positions = sorted(lengths, reverse=True)
+
+        return self.sorted_reference_positions
+
+    def __str__(self):
+        return '
 of length %10i with %10i reads and %5i references' %\
+            (self.get_longest_reference_length(),
+             len(self.reads), len(self.references))
+
+
+class RegionStatistics:
+
+    def __init__(self, input_dir):
+
+        self.input_dir = input_dir
+        self.stats = []
+
+    def _add_sequence(self, qualified_name, region_id,
+                      longest_human_reference, longest_pathogen_reference, record):
+
+        state = 'pathogen'
+        if longest_human_reference:
+            state = 'ambiguous'
+
+        reference_type = qualified_name.split('_')[0]
+        family_name = '_'.join(qualified_name.split('_')[1:])
+
+        read_type, sample_id, number_reads, basepairs = record.id.split(';')
+
+        coverage = int(basepairs) / float(len(longest_pathogen_reference))
+
+        self.stats.append(
+            [reference_type, family_name, region_id, sample_id, state,
+             number_reads, str(len(longest_pathogen_reference)), basepairs, '%.5f' % coverage])
+
+    def run(self, output_file):
+
+        for qualified_family_name in os.listdir(self.input_dir):
+
+            family_dir = os.path.join(self.input_dir, qualified_family_name)
+
+            if not os.path.isdir(family_dir):
+                continue
+
+            consensus_regions = glob.glob(os.path.join(
+                family_dir, 'region_*_consensus.fa'))
+
+            for consensus_region in consensus_regions:
+
+                base_bame = os.path.basename(consensus_region)
+
+                region_id = base_bame.split('_')[1]
+
+                reads = []
+                longest_human_reference = None
+                longest_pathogen_reference = None
+
+                for consensus_record in SeqIO.parse(consensus_region, 'fasta'):
+                    if consensus_record.id.lower().startswith('read'):
+                        # ends with read number; cumulative bases
+                        reads.append(consensus_record)
+                    elif ';Homo_sapiens;' in consensus_record.id:
+                        if longest_human_reference is None or\
+                                len(consensus_record) > longest_human_reference:
+                            longest_human_reference = consensus_record
+                    else:
+                        if longest_pathogen_reference is None or\
+                                len(consensus_record) > longest_pathogen_reference:
+                            longest_pathogen_reference = consensus_record
+
+                for read in reads:
+                    self._add_sequence(qualified_family_name, region_id,
+                                       longest_human_reference, longest_pathogen_reference, read)
+
+        with open(os.path.abspath(os.path.expandvars(output_file)), 'w') as output_handler:
+            output_handler.write(
+                '\t'.join(['reference_type', 'family_name', 'region_id',
+                        'sample_id', 'taxonomic_origin', 'number_reads',
+                        'region_length', 'basepairs', 'coverage']) + '\n')
+            for entries in sorted(self.stats):
+                line = '\t'.join(entries) + '\n'
+                output_handler.write(line)
+
+
+@CLI.subcommand("regions")
+class RegionRunner(cli.Application):
+
+    """ Derive homologous groups and regions from hit files """
+
+    hit_files = cli.SwitchAttr(
+        ['-v', '--virana_hits'], str, list=True, mandatory=True,
+        help="Add hit file for analysis. May be supplied multiple times.")
+
+    lastz_path = cli.SwitchAttr(['-z', '--lastz_path'], str, mandatory=False,
+                                help="Path to lastz executable",
+                                default='')
+
+    jalview_jar_dir = cli.SwitchAttr(
+        ['-j', '--jalview_jar_dir'], str, mandatory=False,
+        help="Directory containing the jalview.jar file",
+        default=None)
+
+    references_path = cli.SwitchAttr(
+        ['-r', '--references'], str, mandatory=True,
+        help="Input fasta file containing genomic references of pathogens and possibly of human chromosomes as well (the latter is experimental)",
+        default='')
+
+    cdna_path = cli.SwitchAttr(['-c', '--cdna'], str, mandatory=False,
+                               help="Input fasta file containing human cDNA records.",
+                               default=None)
+
+    output_dir = cli.SwitchAttr(['-o', '--output_dir'], str, mandatory=True,
+                                help="Output directory that will be filled with subdirectories corresponding to homologous regions. The directory will be generated if it is not existing.",
+                                default='')
+
+    tmp_dir = cli.SwitchAttr(['-t', '--tmp_dir'], str, mandatory=False,
+                             help="Directory in which temorary files are stored. Temporary files will be stored in subdirectories of the provided directory and will be deleted after completion.",
+                             default=None)
+
+    stat_path = cli.SwitchAttr(['-s', '--region_stats'], str, mandatory=False,
+                               help="Path to output statistics tab-delimited text file.",
+                               default='')
+
+    reference_database_filter = cli.SwitchAttr(
+        ['-b', '--reference_database_filter'], str, list=True, mandatory=False,
+        help="Specifies which kind of reference databases are consisdered when extracting hits from hit files. May be specified multiple times. If any are specified, all reference databases not specified are filtered out. By default, this parameter is empty.",
+        default=[])
+
+    pathogen_family_filter = cli.SwitchAttr(
+        ['-f', '--pathogen_family_filter'], str, list=True, mandatory=False,
+        help="Specifies which kind of pathogen families are consisdered when extracting hits from hit files. May be specified multiple times. If any are specified, all families not specified are filtered out. By default, this parameter is empty.",
+        default=[])
+
+    min_read_number = cli.SwitchAttr(
+        ['-m', '--min_read_number'], cli.Range(1, 1000), mandatory=False,
+        help="Minimum number of reads that are required to be present in homologous region. Regions with fewer reads will be omitted from the results.",
+        default=5)
+
+    max_gap_length = cli.SwitchAttr(
+        ['-l', '--max_gap_length'], cli.Range(1, 1000), mandatory=False,
+        help="Maximum number bases that two candidate homologous regions are distant from each other with regard to their positions on a common reference sequence in order for being eligable for merging.",
+        default=25)
+
+    min_region_length = cli.SwitchAttr(
+        ['-x', '--min_region_length'], cli.Range(1, 1000), mandatory=False,
+        help="Minimum number bases of the longest reference sequence of each homologous region that is generated. Shoer regions will be omitted from the results.",
+        default=50)
+
+    word_length = cli.SwitchAttr(
+        ['-w', '--word_length'], cli.Range(1, 21), mandatory=False,
+        help="Word length of the lastz alignment process. Shorter word lengths allow more sensitive but less specific alignments.",
+        default=7)
+
+    ambiguity_cutoff = cli.SwitchAttr(
+        ['-a', '--ambiguity_cutoff'], float, mandatory=False,
+        help="Ratio of variant positions within a column of the homologous region sequence alignment so that the corresponding consensus sequence at this positions show a ambiguous base instead of the majority base.",
+        default=0.3)
+
+    debug = cli.Flag(
+        ["-d", "--debug"], help="Enable debug information")
+
+    def main(self):
+
+        if self.debug:
+            logging.getLogger().setLevel(logging.DEBUG)
+
+        # Make sequence proxy for managing hit files, references, and cdna
+        proxy = SequenceProxy(self.references_path, self.cdna_path)
+        for hit_file in self.hit_files:
+            proxy.add_hit_file(hit_file)
+
+        # Generate homologous groups
+        generator = GroupGenerator(proxy,
+                                   reference_database_filter=self.reference_database_filter,
+                                   pathogen_family_filter=self.pathogen_family_filter,
+                                   tmp_dir=self.tmp_dir)
+        groups = generator.get_groups(min_read_number=self.min_read_number)
+
+        # Prepare analysis modules ('runners') for postprocessing homologous
+        # regions
+        consensus = ConsensusRunner(ambiguity_cutoff=self.ambiguity_cutoff)
+        lastz = LastzRunner(word_length=self.word_length)
+
+        if self.jalview_jar_dir:
+            jalview = JalviewRunner(
+                jalview_jar_dir=self.jalview_jar_dir, tmp_dir=self.tmp_dir)
+        else:
+            jalview = None
+
+        # Make homologous regions within each homologous group
+        for name, group in groups.iteritems():
+            group.merge_regions(max_gap_length=self.max_gap_length)
+            group.filter_regions(
+                min_region_length=self.min_region_length, min_read_number=self.min_read_number)
+
+            if self.cdna_path:
+                group.add_transcripts_to_regions()
+
+            group.write_outputs(self.output_dir, lastz, consensus, jalview)
+            group._delete_temporary_dir()
+
+        # Run statistics on all homologous regions
+        if self.stat_path:
+            statistics = RegionStatistics(self.output_dir)
+            statistics.run(self.stat_path)
+
+if __name__ == "__main__":
+    CLI.run()
diff -r 000000000000 -r 3ba5983012cf vhom_region.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vhom_region.xml	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,91 @@
+
+
+Derive homologous groups and regions from hit files.  
+
+	
+        lastz 
+        virana_python 
+        numpy 
+        biopython 
+        matplotlib 
+        htseq 
+     
+
+    vhom_regions.py --html_file $hom_output --directory ${hom_output.files_path}
+    
+    --virana_hits $virana_hits
+    
+    #if $cdna
+        --cdna $cdna
+    #end if
+    
+    --references $references
+    
+    --ambiguity_cutoff $ambiguity_cutoff
+    
+    --max_gap_length $max_gap_length
+    
+    --min_read_number $min_read_number
+    
+    --word_length $word_length
+    
+    --min_region_length $min_region_length
+    
+    #if $reference_database_filter
+        #set $groups = str($reference_database_filter).split(",")
+        
+        #for $ref in $groups
+            --reference_database_filter $ref
+        #end for
+    #end if
+    
+    #for $i, $s in enumerate( $filter )
+        
+        --pathogen_family_filter $s
+        
+    #end for
+    
+    --output_dir ${hom_output.files_path}
+    
+    --jalview_jar_dir ./jalview
+    
+    --region_stats $stat_output
+     
+    
+    
+    
+        
+        nr 
+            nt 
+        
+        
+        
+             
+    
+     
+    
+    
+    
+         
+ 
+     
diff -r 000000000000 -r 3ba5983012cf vhom_regionplot.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vhom_regionplot.py	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,255 @@
+import matplotlib.pyplot as plt
+
+from matplotlib import rcParams
+from matplotlib import colors
+from matplotlib import cm
+
+from argparse import ArgumentParser
+from matplotlib.backends.backend_pdf import PdfPages
+
+import os
+
+import colorsys
+
+
+
+
+rcParams['figure.figsize'] = (6, 6)
+rcParams['figure.dpi'] = 100
+rcParams['axes.facecolor'] = 'white'
+rcParams['font.size'] = 10
+rcParams['patch.edgecolor'] = 'white'
+rcParams['patch.linewidth']=0.5
+rcParams['patch.facecolor'] = 'black'
+rcParams['font.family'] = 'StixGeneral'
+
+
+def color_variants(r,g,b):
+    h,l2,s= colorsys.rgb_to_hls(r,g,b)
+    l=[l2-0.4,l2-0.1,l2+0.2]
+    hexs=[]
+    for i in l:
+        hexs.append(colors.rgb2hex(colorsys.hls_to_rgb(h,i,s)))
+    return hexs
+    
+def rgb_color_variants(r,g,b):
+    h,l2,s= colorsys.rgb_to_hls(r,g,b)
+    l=[l2-0.4,l2-0.1,l2+0.2]
+    rgbs={}
+    rgbs["pathogen"]=colorsys.hls_to_rgb(h,l[0],s)
+    rgbs["ambiguous"]=colorsys.hls_to_rgb(h,l[1],s)
+    rgbs["human"]=colorsys.hls_to_rgb(h,l[2],s)
+    return rgbs
+
+                
+     
+
+def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
+    """
+    Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
+    
+    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
+    """
+    ax = axes or plt.gca()
+    ax.spines['top'].set_visible(top)
+    ax.spines['right'].set_visible(right)
+    ax.spines['left'].set_visible(left)
+    ax.spines['bottom'].set_visible(bottom)
+    
+    #turn off all ticks
+    ax.yaxis.set_ticks_position('none')
+    ax.xaxis.set_ticks_position('none')
+    
+    #now re-enable visibles
+    if top:
+        ax.xaxis.tick_top()
+    if bottom:
+        ax.xaxis.tick_bottom()
+    if left:
+        ax.yaxis.tick_left()
+    if right:
+        ax.yaxis.tick_right()
+        
+        
+def generateHTML(stat_dict,file,directory):
+    rval=["Homologous groups and regions derived from hit files  
\n"]
+    rval.append("Download Plot ")
+    n_samples=0
+    for sample in stat_dict.iterkeys():
+        n_samples+=1
+        rval.append("
"%(sample))
+    rval.append("")
+    rval.append("")
+    rval.append('")
+    
+    rval.append( '' )
+    with open(file,'w') as file:
+        file.write("\n".join( rval ))
+    
+def generatePyPlot(stat_dict,output_file):
+    pp = PdfPages(output_file)
+    fig = plt.figure()
+    
+    n_samples=len(stat_dict)
+    dict={}
+    position={}
+    dict_label={}
+    for sample in stat_dict.iterkeys():
+        dict[sample]=[{},{}]
+        position[sample]={}
+        dict_label[sample]={}
+        j=0
+        for family in stat_dict[sample].iterkeys():
+         
+            i=0
+            for region,values in stat_dict[sample][family].iteritems():
+                
+                if i not in dict[sample][0]:
+                    dict[sample][0][i]=[]
+                    dict[sample][1][i]=[]
+                    position[sample][i]=[]
+                    dict_label[sample][i]=[]
+                dict[sample][0][i].append(int(values[0][1]))
+                dict[sample][1][i].append(int(values[0][2]))
+                position[sample][i].append(j)
+                dict_label[sample][i].append(values[0][0])
+                i+=1
+            j+=1
+                
+    i=0    
+    for sample in dict.iterkeys():
+        fig = plt.figure()
+        fig.subplots_adjust(bottom=0.25,hspace=0.5)
+        r,g,b,a=cm.hsv(1.*i/n_samples)
+        color_map=rgb_color_variants(r,g,b)
+        for sub in [0,1]:
+          plt.subplot(1,2,sub+1)
+          bottom={}
+          for level,values in dict[sample][sub].iteritems():
+              colors=[color_map[r] for r in dict_label[sample][level]]
+              if level==0:
+                  print level, values, position[sample][level],bottom
+                  plt.bar(position[sample][level],values,bottom=0, width=0.8 ,color=colors)
+              else:
+                  lastvalues=[]
+                  for oldpos in range(len(values)):
+                      lastvalues.append(bottom[position[sample][level][oldpos]])         
+                  print level, values, position[sample][level], lastvalues
+                  plt.bar(position[sample][level],values, width=0.8,bottom=lastvalues ,color=colors)
+              for pos in range(len(values)):
+                  if position[sample][level][pos] not in bottom:
+                        bottom[position[sample][level][pos]]=0
+                  else:
+                        bottom[position[sample][level][pos]]+=values[pos]
+          pos=[x+0.4 for x in range(len(stat_dict[sample]))]    
+          plt.xticks(pos, stat_dict[sample].keys(), rotation='vertical')
+          if(sub==0):
+            plt.ylabel("Reads")
+          else:
+            plt.ylabel("Basepairs")
+          plt.xlabel("")
+          fig.subplots_adjust(bottom=0.25,wspace=0.5)
+          remove_border()
+        plt.suptitle(sample)
+        pp.savefig(fig)
+        i+=1
+    pp.close()
+    
+
+
+def parseStatFile(filename):
+
+    with open(filename,'r') as file:     
+        values=[ map(str,line.split('\t')) for line in file ]
+
+        dict={}
+        for line in values[1:]:
+            if line[3] not in dict:
+                dict[line[3]]={}
+                dict[line[3]][line[1]]={}
+                dict[line[3]][line[1]][line[2]]=[[line[4],line[5],line[7],line[0],line[6]]]
+            else:
+                if line[1] not in dict[line[3]]:
+                    dict[line[3]][line[1]]={}
+                    dict[line[3]][line[1]][line[2]]=[[line[4],line[5],line[7],line[0],line[6]]]
+                else:
+                    if line[2] not in dict[line[3]][line[1]]:
+                        dict[line[3]][line[1]][line[2]]=[[line[4],line[5],line[7],line[0],line[6]]]
+                    else:
+                         dict[line[3]][line[1]][line[2]].append([line[4],line[5],line[7],line[0],line[6]])        
+#        for key in dict.iterkeys():
+#            for family_key,values in dict[key].iteritems():
+#                for line in values:
+#                    print key,family_key,line
+        return dict
+def getFiles(directory):
+    rval={}
+    dlist = os.listdir(directory)
+    for dire in dlist:
+        if(os.path.isdir(os.path.join(directory,dire))):
+             rval[dire]={}
+             flist = os.listdir(os.path.join(directory,dire))
+             for file in flist:
+                split=file.split("_")
+                region=split[1]
+                if region not in rval[dire]:
+                    rval[dire][region]=[file]
+                else:
+                    rval[dire][region].append(file)
+                         
+    return rval
+
+        
+
+if __name__ == "__main__":
+    
+    
+    parser = ArgumentParser()
+    
+    a = parser.add_argument
+    a("--html_file",dest="html_file")
+    a("--directory",dest="directory")
+    a("--stats",dest="stat_file")
+      
+    (options,args)= parser.parse_known_args()
+    
+#    args.insert(0,"dummy")
+#    try:
+#        RegionRunner.run(argv=args)
+#    except SystemExit:
+
+    stat_dict=parseStatFile(options.stat_file)
+    generatePyPlot(stat_dict,os.path.join(options.directory,"plot.pdf"))
+    generateHTML(stat_dict,options.html_file,options.directory)
+    
+        
+                        
+ 
+ 
\ No newline at end of file
diff -r 000000000000 -r 3ba5983012cf vhom_regions.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vhom_regions.py	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,45 @@
+from argparse import ArgumentParser
+import os
+from vhom import RegionRunner
+
+
+def printDir(directory):
+    rval=[""]
+    flist = os.listdir(directory)
+    
+    for entry in flist:
+        if(os.path.isdir(os.path.join(directory,entry))):
+             rval.append('%s '%entry)
+             rval.extend(printDir(os.path.join(directory,entry)))
+        else:
+            rval.append( '%s  ")
+    return rval
+
+
+
+if __name__ == "__main__":
+    
+    
+    parser = ArgumentParser()
+    
+    a = parser.add_argument
+    a("-o","--html_file",dest="html_file")
+    a("-d","--directory",dest="directory")
+      
+    (options,args)= parser.parse_known_args()
+    
+    args.insert(0,"dummy")
+    try:
+        RegionRunner.run(argv=args)
+    except SystemExit:
+        f = open(options.html_file,'w')
+        rval = ["Homologous groups and regions derived from hit files  
\n"]
+        rval.append('This composite dataset is composed of the following files:
')
+        directory= options.directory
+        rval.extend(printDir(directory))
+        
+        rval.append( '' )
+    
+        f.write("\n".join( rval ))
+        f.close() 
diff -r 000000000000 -r 3ba5983012cf vmap.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vmap.py	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,1425 @@
+#!/usr/bin/env python
+#from __future__ import print_function
+
+import cProfile
+
+import sys
+import re
+
+import tempfile
+import subprocess
+import shutil
+import os
+import os.path
+import logging
+import bz2
+import zlib
+
+import math
+import string
+
+from collections import defaultdict, Counter
+
+from subprocess import PIPE
+
+NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum())
+NON_ID = NON_ID.replace('_', '').replace('-', '')
+
+try:
+    from Bio.SeqRecord import SeqRecord
+    from Bio import SeqIO
+    from Bio.Seq import Seq
+except ImportError:
+    message = 'This script requires the BioPython python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    from plumbum import cli
+except ImportError:
+    message = 'This script requires the plumbum python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+
+try:
+    import HTSeq
+except ImportError:
+    message = 'This script requires the HTSeq python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+KHMER_AVAILABLE = True
+try:
+    import khmer
+except ImportError:
+    KHMER_AVAILABLE = False
+
+#from io import BufferedRandom
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def profile_this(fn):
+    def profiled_fn(*args, **kwargs):
+        fpath = fn.__name__ + ".profile"
+        prof = cProfile.Profile()
+        ret = prof.runcall(fn, *args, **kwargs)
+        prof.dump_stats(fpath)
+        return ret
+    return profiled_fn
+
+
+class CLI(cli.Application):
+    """RNA-Seq and DNA-Seq short read analysis by mapping to known reference sequences"""
+    PROGNAME = "vmap"
+    VERSION = "1.0.0"
+    DESCRIPTION = """Virana vmap is an interface to the NCBI and ensembl reference databases that can
+                     generate reference indexes for the short read mappers STAR (RNA-Seq) and
+                     BWA-MEM (DNA-Seq). Short reads can be mapped to arbitrary combinations of
+                     reference databases and the results can be summarized by taxonomic family
+                     as well as stored as SAM file, unsorted BAM file, or as a HIT file that
+                     models multimapping reads between specific reference databases."""
+    USAGE = """The program has four modes that can be accessed by `vmap rnaindex`, `vmap dnaindex`, `vmap rnamap`, and `vmap dnamap.`"""
+
+    def main(self, *args):
+
+        if args:
+            print("Unknown command %r" % (args[0]))
+            print self.USAGE
+            return 1
+
+        if not self.nested_command:
+            print("No command given")
+            print self.USAGE
+            return 1
+
+
+@CLI.subcommand("rnaindex")
+class RNAIndex(cli.Application):
+    """ Creates a STAR index from a FASTA genome reference """
+
+    reference_files = cli.SwitchAttr(
+        ['-r', '--reference_file'], str, list=True, mandatory=True,
+        help="Sets the reference genome(s) FASTA file." +
+        " Multiple occurrences of this parameter are allowed.")
+    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
+                               help="Sets the index output directory." +
+                               " Directory will be generated if not existing." +
+                               " Directory will be filled with several index files.")
+    threads = cli.SwitchAttr(
+        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
+        help="Sets the number of threads to use",
+        default=1)
+    max_ram = cli.SwitchAttr(
+        ['-m'], cli.Range(1, 400000000000), mandatory=False,
+        help="Sets the maximum amount of memory (RAM) to use (in bytes)",
+        default=400000000000)
+    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
+                          help="Path to STAR executable",
+                          default='')
+    sparse = cli.Flag(
+        ["-s", "--sparse"], help="If given, a sparse index that requires less " +
+        " RAM in the mapping phase will be constructed")
+
+    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")
+
+    def main(self):
+
+        if self.debug:
+            logging.getLogger().setLevel(logging.DEBUG)
+
+        # Obtain star executable
+        star = [self.path and self.path or 'STAR']
+
+        # Check if genome directory is existing
+        for reference_file in self.reference_files:
+            if not os.path.exists(reference_file):
+                sys.stdout.write(
+                    'Reference file %s nor existing, exiting' % reference_file)
+                sys.exit(1)
+
+        # Check if output directory is existing
+        if not os.path.exists(self.index_dir):
+            logging.debug(
+                'Making output directory for index at %s' % self.index_dir)
+            os.makedirs(self.index_dir)
+
+        # # Make named pipe to extract genomes
+        # pipe_path = os.path.abspath(os.path.join(self.genome_dir, 'pipe.fa'))
+        # if os.path.exists(pipe_path):
+        #     os.unlink(pipe_path)
+        # os.mkfifo(pipe_path)
+
+        # Make star command line
+        cline = star + ['--runMode', 'genomeGenerate',
+                        '--genomeDir', self.index_dir,
+                        '--limitGenomeGenerateRAM', str(self.max_ram),
+                        '--runThreadN', str(self.threads),
+                        '--genomeFastaFiles'] + self.reference_files
+
+        # Add parameters for sparse (memory-saving) index generation
+        if self.sparse:
+            cline += ['--genomeSAsparseD', '2',
+                      '--genomeChrBinNbits', '12',
+                      '--genomeSAindexNbases', '13']
+
+        else:
+            cline += ['--genomeSAsparseD', '1',
+                      '--genomeChrBinNbits', '18',
+                      '--genomeSAindexNbases', '15']
+
+        if self.debug:
+            print ' '.join(cline)
+
+        # Run STAR reference generation process
+        star_process = subprocess.Popen(' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
+
+        # Block until streams are closed by the process
+        stdout, stderr = star_process.communicate()
+
+        if stderr:
+            sys.stderr.write(stderr)
+
+        if self.debug and stdout:
+            print stdout
+
+
+@CLI.subcommand("dnaindex")
+class DNAIndex(cli.Application):
+    """ Creates a BWA index from a FASTA reference file """
+
+    reference_file  = cli.SwitchAttr(['-r', '--reference_file'], str, mandatory=True,
+                                 help="Sets the input reference FASTA file.")
+    index_dir   = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
+                                 help="Sets the index output directory." +
+                                 " Directory will be generated if not existing." +
+                                 " Directory will be filled with several index files.")
+    path        = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
+                                 help="Path to BWA executable",
+                                 default='')
+    debug       = cli.Flag(["-d", "--debug"], help="Enable debug output")
+
+    if debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    def main(self):
+
+        # Obtain star executable
+        bwa = [self.path and self.path or 'bwa']
+
+        # Check if genome directory is existing
+        if not os.path.exists(self.reference_file):
+            sys.stdout.write('Genome file %s nor existing, exiting' % self.reference_file)
+            sys.exit(1)
+
+        # Check if output directory is existing
+        if not os.path.exists(self.index_dir):
+            logging.debug('Making output directory %s' % self.index_dir)
+            os.makedirs(self.index_dir)
+
+        # Make star command line
+        cline = bwa + ['index', '-a', 'bwtsw', '-p', os.path.join(self.index_dir, 'index'), self.reference_file]
+
+        if self.debug:
+            print ' '.join(cline)
+
+        # Run BWA index generation process
+        bwa_process = subprocess.Popen(' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
+        stdout, stderr = bwa_process.communicate()
+
+        if stderr:
+            sys.stderr.write(stderr)
+
+        if self.debug and stdout:
+            print stdout
+
+
+
+
+
+class SAMHits:
+    """ Converts SAM output of mappers into bzipped HIT files. """
+
+    def __init__(self, output_file, sample_id, refseq_filter=None, min_mapping_score=None,\
+                     min_alignment_score=None, max_mismatches=None,\
+                     max_relative_mismatches=None, min_continiously_matching=None,\
+                     filter_complexity=False):
+
+        self.output_file                = bz2.BZ2File(output_file, 'wb', buffering=100 * 1024 * 1024)
+        self.sample_id                  = sample_id.translate(None, NON_ID)
+        self.refseq_filter              = refseq_filter
+        self.max_mismatches             = max_mismatches
+        self.max_relative_mismatches    = max_relative_mismatches
+        self.current_group              = []
+        self.min_mapping_score          = min_mapping_score
+        self.min_alignment_score        = min_alignment_score
+        self.min_continiously_matching  = min_continiously_matching
+        self.filter_complexity          = filter_complexity
+
+        self.re_matches = re.compile(r'(\d+)M')
+        self.re_dels    = re.compile(r'(\d+)D')
+
+    def count(self, parsed_line):
+
+        if parsed_line is None:
+            return
+
+        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
+                    cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
+                    is_end1, is_end2, number_mismatches, alignment_score,\
+                    number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
+                    is_paired, number_matches, read_end_pos, max_match = parsed_line
+
+        if not is_mapped:
+            return
+
+        if self.filter_complexity:
+
+            avg_compression = float(len(zlib.compress(seq)))/len(seq)
+
+            if avg_compression < 0.5:
+                return
+
+            # length = len(seq)
+            # counts = [seq.count(nuc) for nuc in 'ACGT']
+            # min_count = length * 0.10
+            # max_count = length * 0.50
+            # for count in counts:
+            #     if count < min_count or count > max_count:
+            #         return None
+
+            # counter = Counter()
+            # for i in range(length - 2):
+            #     counter[seq[i: i + 3]] += 1
+            # maximal = length - 4
+
+            # highest = sum([v for k, v in counter.most_common(2)])
+            # if highest > (maximal / 3.0):
+            #     return None
+
+            # self.passed.append(avg_compression)
+
+        pair_id = ''
+        if is_end1:
+            pair_id = '/1'
+
+        elif is_end2:
+            pair_id = '/2'
+
+        read_name = self.sample_id + ';' + read_name + pair_id
+
+        # Initialize new current group
+        if len(self.current_group) == 0:
+            self.current_group = [read_name, seq, []]
+
+        # Write old current group to file
+        if read_name != self.current_group[0]:
+            self._write_group()
+            self.current_group = [read_name, seq, []]
+
+        try:
+            refseq_group, family, organism, identifier = ref_name.split(';')[:4]
+        except ValueError:
+            sys.stderr.write('Read mapped to malformed reference sequence %s, skipping\n' % ref_name)
+            return
+
+        if self.min_continiously_matching:
+
+            if self.min_continiously_matching > max_match:
+                return
+
+        if self.max_mismatches\
+            and int(number_mismatches) > self.max_mismatches:
+            return
+
+        if self.max_relative_mismatches\
+            and int(number_mismatches) / float(len(seq))\
+            > self.max_relative_mismatches:
+            return
+
+        if self.min_mapping_score\
+            and self.min_mapping_score > mapping_score:
+            return
+
+        if self.min_alignment_score\
+            and self.min_alignment_score > alignment_score:
+            return
+
+        start = int(ref_position) + 1
+
+        self.current_group[2].append([refseq_group, family, organism, identifier, str(start), str(read_end_pos)])
+
+    def _write_group(self):
+        passed = True
+
+        if self.refseq_filter:
+            passed = False
+            for refseq_group, family, organism, identifier, start, end in self.current_group[2]:
+                if passed:
+                    break
+                for f in self.refseq_filter:
+                    if refseq_group == f:
+                        passed = True
+                        break
+        if passed:
+            description = []
+            for identifier in self.current_group[2]:
+                description.append(';'.join(identifier))
+            description = '|'.join(description)
+
+            record = SeqRecord(Seq(self.current_group[1]))
+            record.id = 'Read;' + self.current_group[0]
+            record.description = description
+
+            SeqIO.write([record], self.output_file, "fasta")
+
+    def write(self):
+
+        self._write_group()
+        self.output_file.close()
+
+class SAMParser:
+
+    def parse(self, line):
+
+        if line[0] == '@':
+            return None
+
+        alignment   = HTSeq._HTSeq.SAM_Alignment.from_SAM_line(line)
+        read_name   = alignment.read.name
+        seq         = alignment.read.seq
+        qual        = alignment.read.qual
+        flag        = alignment.flag
+        cigar       = None
+
+        is_paired       = (flag & 1)
+        is_mapped       = not (flag & 4)
+        is_mate_mapped  = alignment.mate_aligned is not None #not (flag & 8)
+        is_reverse      = (flag & 16)
+        is_end1         = (flag & 64)
+        is_end2         = (flag & 128)
+        is_primary      = not (flag & 256)
+
+        read_key = (read_name, is_end1)
+
+        ref_name            = None
+        ref_position        = None
+        mapping_score       = 0
+        mate_ref_name       = None
+        mate_ref_position   = None
+        insert_size         = None
+        alignment_score     = 0
+        read_end_pos        = None
+
+        if is_mate_mapped and alignment.mate_start:
+            mate_ref_name       = alignment.mate_start.chrom
+            mate_ref_position   = alignment.mate_start.start
+
+        number_hits         = 0
+        alignment_score     = 0
+        number_mismatches   = 0
+
+        number_matches      = 0
+        max_match           = 0
+
+        if is_mapped:
+
+            ref_name            = alignment.iv.chrom
+            ref_position        = alignment.iv.start
+            read_end_pos        = alignment.iv.end
+            alignment_score     = alignment.aQual
+            cigar               = alignment.cigar
+
+            if is_mate_mapped:
+                insert_size         = alignment.inferred_insert_size
+
+            for c in cigar:
+                if c.type == 'M':
+                    number_matches += c.size
+                    max_match = max(max_match, c.size)
+
+            for tag, value in alignment.optional_fields:
+                if tag == 'NM':
+                    number_hits = value
+                elif tag == 'AS':
+                    alignment_score = value
+                elif tag == 'NH':
+                    number_mismatches = value
+
+        return read_key, read_name, flag, ref_name, ref_position, mapping_score,\
+            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
+            is_end1, is_end2, number_mismatches, alignment_score,\
+            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
+            is_paired, number_matches, read_end_pos, max_match
+
+
+class SAMQuality:
+
+    def __init__(self, file_path):
+
+        self.file_path          = file_path
+
+        self.stored             = defaultdict(Counter)
+        self.all_references     = defaultdict(int)
+        self.primary_references = defaultdict(int)
+        self.complement         = string.maketrans('ATCGN', 'TAGCN')
+
+        if KHMER_AVAILABLE:
+            self.ktable = khmer.new_ktable(10)
+
+    def _get_complement(self, sequence):
+
+        return sequence.translate(self.complement)[::-1]
+
+    def _get_summary(self, counter):
+        """"Returns five numbers (sum, extrema, mean, and std)
+            for a max_frequency counter """
+
+        maximum  = 0
+        minimum  = sys.maxint
+        thesum   = 0
+        allcount = 0
+        mode     = [0, None]
+
+        items       = 0.0
+        mean        = 0.0
+        m2          = 0.0
+        variance    = 0.0
+
+        for item in counter:
+
+            count   = counter[item]
+            if count > mode[0]:
+                mode = [count, item]
+
+            allcount += count
+            maximum = max(maximum, item)
+            minimum = min(minimum, item)
+            thesum  += (count * item)
+
+            x = 1
+            while x <= count:
+                items       += 1
+                delta       = item - mean
+                mean        = mean + delta / items
+                m2          = m2 + delta * (item - mean)
+                variance    = m2 / items
+                x += 1
+
+        std = math.sqrt(variance)
+
+        return allcount, thesum, minimum, maximum, mode[1], mean, std
+
+
+    def _to_unit(self, item, is_percentage=False):
+        """ Convert a numeric to a string with metric units """
+
+        if is_percentage:
+            return ('%-.3f' % (item * 100)) + '%'
+        converted = None
+        try:
+            item = float(item)
+            if item > 10**12:
+                converted = str(round(item / 10**9,3))+'P'
+            elif item > 10**9:
+                converted = str(round(item / 10**9,3))+'G'
+            elif item > 10**6:
+                converted = str(round(item / 10**6,3))+'M'
+            elif item > 10**3:
+                converted = str(round(item / 10**3,3))+'K'
+            else:
+                converted = str(round(item,3))
+        except:
+            converted = str(item)
+
+        return converted
+
+    def _str_metrics(self, data):
+
+        str_metrics = []
+
+        for (item, metric) in sorted(data.keys()):
+            counter = data[(item, metric)]
+            if not hasattr(counter.iterkeys().next(), 'real'):
+                for element, count in counter.most_common():
+                    str_metrics.append(self._str_metric(item, metric, element, count, no_units=True))
+            else:
+                summary = self._get_summary(counter)
+                str_metrics.append(self._str_metric(item, metric, *summary))
+
+        return str_metrics
+
+    def _str_metric(self, item, metric, count, thesum='', minimum='',\
+                        maximum='', mode='', mean='', std='', no_units=False):
+
+        counters = [count, thesum, minimum, maximum, mode, mean, std]
+        counters = map(str, counters)
+
+        if no_units:
+            items = [item, metric] +  counters
+        else:
+            units = map(self._to_unit, counters)
+            items = [item, metric] + units
+
+        return '%-15s\t%-60s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\n' \
+                    % tuple(items)
+
+
+    def _count_read(self, metric, data, sample):
+
+        item = 'read'
+
+        (insert_size, alignment_score, mapping_score, length,\
+            q20_length, avg_phred_quality, number_hits, is_reverse) = data
+
+        self.stored[(item, metric + ' mappings')][number_hits] += sample
+        self.stored[(item, metric + ' insert')][insert_size] += sample
+
+
+    def _count_segment(self, metric, data, sample):
+
+        item = 'segment'
+
+        (insert_size, alignment_score, mapping_score, length,\
+            q20_length, avg_phred_quality, number_hits, is_reverse) = data
+
+        self.stored[(item, metric + ' algq')][alignment_score] += sample
+        self.stored[(item, metric + ' mapq')][mapping_score] += sample
+        self.stored[(item, metric + ' length')][length] += sample
+        self.stored[(item, metric + ' q20length')][q20_length] += sample
+        self.stored[(item, metric + ' meanbasequal')][avg_phred_quality] += sample
+        self.stored[(item, metric + ' reverse')][is_reverse] += sample
+
+    def count(self, parsed_line):
+
+        if parsed_line is None:
+            return
+
+        #print_metric('Item' , 'Metric', 'Count', 'Sum', 'Min', 'Max', 'Mode', 'Mean', 'STD')
+
+        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
+                    cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
+                    is_end1, is_end2, number_mismatches, alignment_score,\
+                    number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
+                    is_paired, number_matches, read_end_pos, max_match = parsed_line
+
+        phred_quality       = [q - 33 for q in qual]
+        avg_phred_quality   = sum(phred_quality) / float(len(phred_quality))
+        length              = len(seq)
+        mate_reference_id   = mate_ref_name
+        reference_id        = ref_name
+        reference           = reference_id is not None and reference_id != '*'
+        insert_size         = insert_size and abs(insert_size) or insert_size
+        is_segment1         = not is_paired or (is_paired and is_end1)
+        is_reverse          = is_reverse
+        is_unique           = is_primary and number_hits == 1
+        is_translocation    = is_paired and is_mapped and is_mate_mapped\
+                                    and (mate_reference_id != '=' and reference_id != mate_reference_id)
+        is_part_of_doublemap    = is_paired and is_mapped and is_mate_mapped
+        is_part_of_halfmap      = is_paired and (is_mapped != is_mate_mapped)
+        is_part_of_nomap        = is_paired and not is_mapped and not is_mate_mapped
+
+
+        # Count length until first low quality base call
+        q20_length = 0
+        for q in phred_quality:
+            if q < 20:
+                break
+            q20_length += 1
+
+        # Count kmers
+        if KHMER_AVAILABLE:
+            if not is_reverse:
+                self.ktable.consume(seq)
+            else:
+                self.ktable.consume(self._get_complement(seq))
+
+        if reference:
+
+            self.all_references[reference_id] += 1
+
+            if is_primary:
+                self.primary_references[reference_id] += 1
+
+
+        data   = (insert_size, alignment_score, mapping_score, length,\
+                        q20_length, avg_phred_quality, number_hits, is_reverse)
+
+        sample = 1
+
+        self._count_segment('sequenced', data, sample)
+        if is_mapped:
+            self._count_segment('sequenced mapped multi', data, sample)
+            if is_primary:
+                self._count_segment('sequenced mapped primary', data, sample)
+                if number_hits and is_unique:
+                    self._count_segment('sequenced mapped primary unique', data, sample)
+
+            if is_segment1:
+                self._count_read('sequenced mapped multi', data, sample)
+                if is_primary:
+                    self._count_read('sequenced mapped primary', data, sample)
+
+        if is_paired:
+            self._count_segment('sequenced paired', data, sample)
+
+            if is_part_of_doublemap:
+                self._count_segment('sequenced paired doublemap', data, sample)
+
+                if is_primary:
+                    self._count_segment('sequenced paired doublemap primary', data, sample)
+
+                if is_segment1:
+                    self._count_read('sequenced paired doublemap multi', data, sample)
+
+                    if is_primary:
+                        self._count_read('sequenced paired doublemap primary', data, sample)
+
+                        if number_hits and is_unique:
+                            self._count_read('sequenced paired doublemap primary unique', data, sample)
+
+                            if is_translocation:
+                                self._count_read('sequenced paired doublemap primary unique translocation', data, sample)
+
+            elif is_part_of_halfmap:
+
+                self._count_segment('sequenced paired halfmap', data, sample)
+
+                # The mapped segment
+                if is_mapped:
+
+                    self._count_segment('sequenced paired halfmap mapped', data, sample)
+
+                    if is_primary:
+
+                        self._count_read('sequenced paired halfmap mapped primary', data, sample)
+
+                        if number_hits and is_unique:
+                            self._count_read('sequenced paired halfmap mapped primary unique', data, sample)
+
+                    elif not is_primary:
+
+                        self._count_read('sequenced unpaired mapped multi', data, sample)
+
+                # The unmapped segment
+                elif not is_mapped:
+                    self._count_segment('sequenced paired halfmap unmapped', data, sample)
+
+            elif is_part_of_nomap:
+
+                self._count_segment('sequenced paired nomap', data, sample)
+
+                if is_segment1:
+
+                    self._count_read('sequenced paired nomap', data, sample)
+
+        elif not is_paired:
+
+            self._count_segment('sequenced unpaired', data, sample)
+
+            if is_mapped:
+
+                self._count_segment('sequenced unpaired mapped', data, sample)
+
+                if is_primary:
+
+                        self._count_read('sequenced unpaired mapped primary', data, sample)
+
+                        if number_hits and is_unique:
+
+                            self._count_read('sequenced paired unpaired mapped primary unique', data, sample)
+
+                elif not is_primary:
+
+                    self._count_read('sequenced unpaired mapped multi', data, sample)
+
+
+            elif not is_mapped:
+
+                self._count_segment('sequenced unpaired unmapped', data, sample)
+
+                if is_segment1:
+                    self._count_read('sequenced unpaired unmapped', data, sample)
+
+    def write(self):
+
+        with open(self.file_path, 'w') as output_file:
+
+            all_references = sorted([(count, reference) for reference, count\
+                                    in self.all_references.iteritems()], reverse=True)
+
+            for j, (count, reference) in enumerate(all_references[:30]):
+                self.stored[('segment', 'multireference_' + str(j+1))][reference] = count
+
+            primary_references = sorted([(count, reference) for reference, count\
+                                            in self.primary_references.iteritems()], reverse=True)
+
+            for j, (count, reference) in enumerate(primary_references[:30]):
+                self.stored[('segment', 'primaryreference_' + str(j+1))][reference] = count
+
+            # Extract top-ranking kmers
+            if KHMER_AVAILABLE:
+                kmer_frequencies = []
+                for i in range(0, self.ktable.n_entries()):
+                    n = self.ktable.get(i)
+                    if n > 0:
+                        kmer_frequencies.append((n, self.ktable.reverse_hash(i)))
+                kmer_frequencies = sorted(kmer_frequencies, reverse=True)
+                for j, (frequency, kmer) in enumerate(kmer_frequencies[:10]):
+                    self.stored[('segment', 'kmer_' + str(j+1))][kmer] = frequency
+
+            output_file.writelines(self._str_metrics(self.stored))
+
+
+class SAMTaxonomy:
+    """ Provides taxonomic summary information from a SAM file stream. """
+
+    def __init__(self, file_path):
+
+        self.file_path = file_path
+
+        self.count_primaries = Counter()
+        self.detailed_information = {}
+
+        self._last_read                 = (None, None)
+        self._last_read_human_prim      = 0
+        self._last_read_human_sec       = 0
+        self._last_organisms            = set()
+
+    def count(self, parsed_line):
+
+        if parsed_line is None:
+            return
+
+        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
+                    cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
+                    is_end1, is_end2, number_mismatches, alignment_score,\
+                    number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
+                    is_paired, number_matches, read_end_pos, max_match = parsed_line
+
+        if is_mapped:
+
+            refseq_group, family, organism, gi = ref_name.split(';')[:4]
+
+            if is_primary:
+                self.count_primaries[organism] += 1
+
+            if organism not in self.detailed_information:
+                # refseq_group. family, gis, avg_mapping_score,
+                # avg_seq_length, avg_number_hits, avg_alignment_score, avg_nr_mismatches
+                initial = [refseq_group,
+                           family,
+                           set([gi]),
+                           [int(mapping_score), 1],
+                           [len(seq), 1],
+                           [0, 0],
+                           [alignment_score, 1],
+                           [number_mismatches, 1],
+                           0,
+                           0]
+
+                self.detailed_information[organism] = initial
+
+            else:
+                entry = self.detailed_information[organism]
+                entry[2].add(gi)
+                entry[3][0] += int(mapping_score)
+                entry[3][1] += 1
+                entry[4][0] += len(seq)
+                entry[4][1] += 1
+                entry[6][0] += alignment_score
+                entry[6][1] += 1
+                entry[7][0] += number_mismatches
+                entry[7][1] += 1
+
+            if is_primary:
+                entry = self.detailed_information[organism]
+                entry[5][0] += number_hits
+                entry[5][1] += 1
+
+            if self._last_read == (None, None):
+                self._last_read = read_key
+
+            if self._last_read != read_key:
+
+                for last_organism in self._last_organisms:
+
+                    self.detailed_information[last_organism][8]\
+                        += self._last_read_human_prim
+
+                    self.detailed_information[last_organism][9]\
+                        += self._last_read_human_sec
+
+                self._last_read = read_key
+                self._last_organisms = set()
+                self._last_read_human_prim  = 0
+                self._last_read_human_sec   = 0
+
+            self._last_organisms.add(organism)
+
+            if organism == 'Homo_sapiens':
+                if is_primary:
+                    self._last_read_human_prim += 1
+                else:
+                    self._last_read_human_sec += 1
+
+    def get_summary(self, top=100):
+
+        lines = []
+
+        lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10s\t%10s\t%5s\t%5s\t%5s\t%10s\t%10s\n'\
+            % ('Count', 'Group', 'Family', 'Organism', 'Targets', 'ReadLen', 'Hits', 'Map', 'Algn', 'Mism', 'HuP', 'HuS'))
+
+        top_organisms = self.count_primaries.most_common(top)
+
+        for organism, count in top_organisms:
+
+            refseq_group, family, identifiers,\
+                avg_mapping_score, avg_seq_length, avg_number_hits,\
+                avg_alignment_score, avg_nr_mismatches, human_prim, human_sec\
+                = self.detailed_information[organism]
+
+            avg_len = int(avg_seq_length[0] / float(avg_seq_length[1]))
+            if avg_number_hits[1] == 0:
+                avg_hits = 0
+            else:
+                avg_hits = int(avg_number_hits[
+                               0] / float(avg_number_hits[1]))
+
+            avg_mapping_score = int(avg_mapping_score[
+                                    0] / float(avg_mapping_score[1]))
+
+            avg_alignment_score = int(avg_alignment_score[
+                                      0] / float(avg_alignment_score[1]))
+
+            avg_nr_mismatches = int(avg_nr_mismatches[
+                                    0] / float(avg_nr_mismatches[1]))
+
+            nr_ids = len(identifiers)
+
+            if count > 10**6:
+                count = str(round(count / float(10**6), 3)) + 'M'
+            if human_prim > 10**6:
+                human_prim = str(round(human_prim / float(10**6), 3)) + 'M'
+            if human_sec > 10**6:
+                human_sec = str(round(human_sec / float(10**6), 3)) + 'M'
+            if nr_ids > 10**6:
+                nr_ids = str(round(nr_ids / float(10**6), 3)) + 'M'
+
+            lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10i\t%10i\t%5i\t%5i\t%5i\t%10s\t%10s\n'\
+                % (str(count), refseq_group[:20], family[:20], organism[:20],\
+                    str(nr_ids), avg_len, avg_hits, avg_mapping_score,\
+                    avg_alignment_score, avg_nr_mismatches, str(human_prim),\
+                    str(human_sec)))
+
+        return lines
+
+    def write(self):
+
+        with open(self.file_path, 'w') as output_file:
+            output_file.writelines(self.get_summary())
+
+@CLI.subcommand("rnamap")
+class RNAmap(cli.Application):
+    """ Map input reads against a STAR index """
+
+    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
+                               help="Sets the index output directory")
+
+    threads = cli.SwitchAttr(
+        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
+        help="Sets the number of threads to use",
+        default=1)
+
+    taxonomy = cli.SwitchAttr(
+        ['-x', '--taxonomy'], str, mandatory=False,
+        help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping",
+        default='')
+
+    star_path = cli.SwitchAttr(['--star_path'], str, mandatory=False,
+                          help="Path to STAR executable",
+                          default='')
+
+    samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False,
+                          help="Path to samtools executable",
+                          default='')
+
+    temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False,
+                          help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.",
+                          default='')
+
+    min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False,
+                          help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)",
+                          default=None)
+
+    min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False,
+                          help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)",
+                          default=None)
+
+    max_mismatches = cli.SwitchAttr(['--max_mismatches'],  cli.Range(0, 10000000), mandatory=False,
+                          help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)",
+                          default=None)
+
+    max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False,
+                          help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)",
+                          default=None)
+
+    min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False,
+                          help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)",
+                          default=None)
+
+    filter_complexity = cli.Flag(['--filter_complexity'],
+                          help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)",
+                          default=False)
+
+    bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False,
+                         help="Path to unsorted, unindexed output BAM file",
+                         default='')
+
+    sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False,
+                         help="Path to output SAM file",
+                         default='')
+
+    qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False,
+                         help="Path to output quality file",
+                         default='')
+
+    hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False,
+                          help="Path to bzip2-compressed tab-delimited output hit file",
+                          default='')
+
+    sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False,
+                          help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file",
+                          default='no_sample_id')
+
+    unmapped1 = cli.SwitchAttr(['--unmapped_end_1'], str, mandatory=False,
+                         help="Output path to uncompressed fastq file containing unmapped reads, first ends only for paired ends.",
+                         default='')
+
+    unmapped2 = cli.SwitchAttr(['--unmapped_end_2'], str, mandatory=False,
+                         help="Output path to uncompressed fastq file containing unmapped reads, second ends only for paired ends.",
+                         default='')
+
+    splice_junctions = cli.SwitchAttr(['--splice_junctions'], str, mandatory=False,
+                         help="Input path to splice junction file (currently not implemented)",
+                         default='')
+
+    chimeric_mappings = cli.SwitchAttr(['--chimeric_mappings'], str, mandatory=False,
+                         help="Ouput path to SAM file containing chimeric mappings",
+                         default='')
+
+    hit_filter = cli.SwitchAttr(
+        ['-f', '--virana_hit_filter'], str, list=True, mandatory=False,
+        help="Only generate hit groups that include at last one read mapping to a reference of this reference group.",
+        default=[])
+
+    debug = cli.Flag(["-d", "--debug"], help="Enable debug information")
+
+    reads = cli.SwitchAttr(
+        ['-r', '--reads'], str, list=True, mandatory=True,
+        help="Sets the input reads. Add this parameter twice for paired end reads.")
+
+    zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped")
+
+    sensitive = cli.Flag(
+        ["--sensitive"], help="If given, mapping will process slower and more sensitive")
+
+    def main(self):
+
+        if self.debug:
+            logging.getLogger().setLevel(logging.DEBUG)
+
+        # Obtain star executable
+        star        = [self.star_path and self.star_path or 'STAR']
+        samtools    = [self.samtools_path and self.samtools_path or 'samtools']
+
+        # Check if genome directory is existing
+        if not os.path.exists(self.index_dir):
+            sys.stdout.write('Index directory %s not existing, exiting' % self.genome_dir)
+            sys.exit(1)
+
+        if self.temp_path:
+            temp_path = tempfile.mkdtemp(dir=self.temp_path)
+        else:
+            temp_path = tempfile.mkdtemp()
+
+        first_ends      = []
+        second_ends     = []
+        single_ends     = []
+
+        if len(self.reads) == 2:
+            first, second = self.reads
+            first_ends.append(first)
+            second_ends.append(second)
+
+        elif len(self.reads) == 1:
+            single_ends.append(self.reads[0])
+
+        else:
+            sys.stdout.write('Invalid number of fastq files; provide either one (single end) or two (paired end)')
+            sys.exit(1)
+
+        if single_ends and not first_ends and not second_ends:
+            reads = [','.join(single_ends)]
+
+        elif first_ends and second_ends:
+            reads = [','.join(first_ends), ','.join(second_ends)]
+
+
+        star_cline = star + ['--runMode', 'alignReads',
+                        '--genomeDir', self.index_dir,
+                        '--runThreadN', str(self.threads),
+                        '--readMatesLengthsIn', 'NotEqual',
+                        '--outFileNamePrefix', os.path.join(
+                            temp_path, 'out'),
+                        '--outSAMmode', 'Full',
+                        '--outSAMstrandField', 'None',
+                        '--outSAMattributes', 'Standard',
+                        '--outSAMunmapped', 'Within',
+                        '--outStd', 'SAM',
+                        '--outFilterMultimapNmax', '1000',
+                        '--outSAMprimaryFlag', 'AllBestScore',
+                        '--outSAMorder', 'PairedKeepInputOrder']
+
+        if self.unmapped1 or self.unmapped2:
+            star_cline += ['--outReadsUnmapped', 'Fastx']
+        else:
+            star_cline += ['--outReadsUnmapped', 'None']
+
+
+        if self.zipped:
+            star_cline += ['--readFilesCommand', 'zcat']
+
+        if self.sensitive:
+            star_cline += ['--outFilterMultimapScoreRange', '10',
+                          '--outFilterMismatchNmax', '60',
+                          '--outFilterMismatchNoverLmax', '0.3',
+                          '--outFilterScoreMin', '0',
+                          '--outFilterScoreMinOverLread', '0.3',
+                          '--outFilterMatchNmin', '0',
+                          '--outFilterMatchNminOverLread', '0.66',
+                          '--seedSearchStartLmax', '12',
+                          '--winAnchorMultimapNmax', '50']
+
+        star_cline += ['--readFilesIn'] + reads
+
+        if self.debug:
+            print ' '.join(star_cline)
+
+        # Try if we can make the relevant files
+        touch_files = [self.unmapped1, self.unmapped2, self.taxonomy, self.qual, self.hits, self.sam, self.bam]
+        for file_path in touch_files:
+            if file_path is None or file_path == '':
+                continue
+            try:
+                with file(file_path, 'a'):
+                    os.utime(file_path, None)
+            except IOError:
+                sys.stderr.write('Could not write output file %s\n' % file_path)
+                sys.exit(1)
+
+        star_process = subprocess.Popen(' '.join(
+                                        star_cline), shell=True, stdout=PIPE)
+
+        parser      = SAMParser()
+
+        if self.taxonomy:
+            taxonomy    = SAMTaxonomy(self.taxonomy)
+
+        if self.qual:
+            quality     = SAMQuality(self.qual)
+
+        if self.hits:
+            hits        = SAMHits(self.hits, self.sample_id, self.hit_filter,
+                        self.min_mapping_score,
+                        self.min_alignment_score,
+                        self.max_mismatches,
+                        self.max_relative_mismatches,
+                        self.min_continiously_matching,
+                        self.filter_complexity)
+
+        if self.sam:
+            sam_file = open(self.sam, 'w')
+
+        if self.bam:
+            with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file:
+                samtools_cline = samtools + [
+                    'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin']
+                if self.debug:
+                    print ' '.join(samtools_cline)
+                samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE)
+
+
+        do_sam = self.sam
+        do_bam = self.bam
+        do_taxonomy = self.taxonomy
+        do_qual = self.qual
+        do_hits = self.hits
+        do_parse = do_taxonomy or do_qual or do_hits
+
+        for i, line in enumerate(iter(star_process.stdout.readline, '')):
+
+            if do_sam:
+                sam_file.write(line)
+
+            if do_bam:
+                samtools_process.stdin.write(line)
+
+            if line[0] == '@':
+                continue
+
+            if do_parse:
+                parsed_line = parser.parse(line)
+
+            if  do_taxonomy:
+                taxonomy.count(parsed_line)
+                if i > 0 and (i % 50000) == 0:
+                    print ''.join(taxonomy.get_summary(10))
+
+            if do_qual:
+                quality.count(parsed_line)
+
+            if do_hits:
+                hits.count(parsed_line)
+
+        if do_bam:
+            samtools_process.stdin.close()
+
+        if do_sam:
+            sam_file.close()
+
+        if do_hits:
+            hits.write()
+
+        if do_taxonomy:
+            print ''.join(taxonomy.get_summary(10))
+            taxonomy.write()
+
+        if do_qual:
+            quality.write()
+
+        try:
+            if self.unmapped1:
+                shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate1'),\
+                self.unmapped1)
+        except IOError:
+            pass
+
+        try:
+            if self.unmapped2:
+                shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate2'),\
+                 self.unmapped2)
+        except IOError:
+            pass
+
+        try:
+            if self.chimeric_mappings:
+                shutil.move(os.path.join(temp_path, 'out' + 'Chimeric.out.sam'),\
+                 self.chimeric_mappings)
+        except IOError:
+            pass
+
+        shutil.rmtree(temp_path)
+
+@CLI.subcommand("dnamap")
+class DNAmap(cli.Application):
+    """ Map input reads against a BWA index """
+
+    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
+                               help="Sets the index output directory")
+
+    threads = cli.SwitchAttr(
+        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
+        help="Sets the number of threads to use",
+        default=1)
+
+    taxonomy = cli.SwitchAttr(
+        ['-x', '--taxonomy'], str, mandatory=False,
+        help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping",
+        default='')
+
+    samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False,
+                          help="Path to samtools executable",
+                          default='')
+
+    temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False,
+                          help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.",
+                          default='')
+
+    min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False,
+                          help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)",
+                          default=None)
+
+    min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False,
+                          help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)",
+                          default=None)
+
+    max_mismatches = cli.SwitchAttr(['--max_mismatches'],  cli.Range(0, 10000000), mandatory=False,
+                          help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)",
+                          default=None)
+
+    max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False,
+                          help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)",
+                          default=None)
+
+    min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False,
+                          help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)",
+                          default=None)
+
+    filter_complexity = cli.Flag(['--filter_complexity'],
+                          help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)",
+                          default=False)
+
+    sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False,
+                          help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file",
+                          default='no_sample_id')
+
+    bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False,
+                         help="Path to unsorted, unindexed output BAM file",
+                         default='')
+
+    sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False,
+                         help="Path to output SAM file",
+                         default='')
+
+    qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False,
+                         help="Path to output quality file",
+                         default='')
+
+    hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False,
+                          help="Path to bzip2-compressed tab-delimited output virana hit file",
+                          default='')
+
+    hit_filter = cli.SwitchAttr(
+        ['-f', '--virana_hit_filter'], str, list=True, mandatory=False,
+        help="Only generate hit groups that include at last one read mapping to a reference of this reference group.",
+        default=[])
+
+    debug = cli.Flag(["-d", "--debug"], help="Enable debug information")
+
+    zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped")
+
+    sensitive = cli.Flag(
+        ["--sensitive"], help="If given, mapping will process slower and more sensitive")
+
+
+    bwa_path = cli.SwitchAttr(['--bwa_path'], str, mandatory=False,
+                          help="Path to BWA executable",
+                          default='')
+
+    debug = cli.Flag(["-d", "--debug"], help="Enable debug information")
+
+    if debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    reads = cli.SwitchAttr(
+        ['-r', '--reads'], str, list=True, mandatory=True,
+        help="Sets the input reads. Add this parameter twice for paired end reads.")
+
+    def main(self):
+
+        if self.debug:
+            logging.getLogger().setLevel(logging.DEBUG)
+
+        # Obtain star executable
+        bwa         = [self.bwa_path and self.bwa_path or 'bwa']
+        samtools    = [self.samtools_path and self.samtools_path or 'samtools']
+
+        # Check if genome directory is existing
+        if not os.path.exists(self.index_dir):
+            sys.stdout.write('Index directory %s not existing, exiting'\
+                % self.genome_dir)
+            sys.exit(1)
+
+        if len(self.reads) not in (1, 2):
+            message = 'Invalid number of FASTQ files; supply either one (single end) or two (paired end)\n'
+            sys.stderr.write(message)
+            sys.exit(1)
+
+        bwa_cline = bwa + ['mem', '-t', str(self.threads), '-M', os.path.join(self.index_dir, 'index')]
+
+        bwa_cline += self.reads
+
+        if self.debug:
+            print ' '.join(bwa_cline)
+
+        bwa_process = subprocess.Popen(' '.join(bwa_cline), shell=True, stdout=PIPE)
+
+        parser      = SAMParser()
+
+        if self.taxonomy:
+            taxonomy    = SAMTaxonomy(self.taxonomy)
+
+        if self.qual:
+            quality     = SAMQuality(self.qual)
+
+        if self.hits:
+            hits        = SAMHits(self.hits, self.sample_id, self.hit_filter,
+                        self.min_mapping_score,
+                        self.min_alignment_score,
+                        self.max_mismatches,
+                        self.max_relative_mismatches,
+                        self.min_continiously_matching,
+                        self.filter_complexity)
+
+        if self.sam:
+            sam_file = open(self.sam, 'w', buffering=100 * 1024 * 1024)
+
+        if self.bam:
+            with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file:
+                samtools_cline = samtools + [
+                    'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin']
+                if self.debug:
+                    print ' '.join(samtools_cline)
+                samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE)
+
+        do_sam = self.sam
+        do_bam = self.bam
+        do_taxonomy = self.taxonomy
+        do_qual = self.qual
+        do_hits = self.hits
+        do_parse = do_taxonomy or do_qual or do_hits
+
+        for i, line in enumerate(iter(bwa_process.stdout.readline, '')):
+
+            if do_sam:
+                sam_file.write(line)
+
+            if do_bam:
+                samtools_process.stdin.write(line)
+
+            if do_parse:
+                parsed_line = parser.parse(line)
+
+            if do_taxonomy:
+                taxonomy.count(parsed_line)
+
+                if i > 0 and (i % 10000) == 0:
+                    print ''.join(taxonomy.get_summary(10))
+
+            if do_qual:
+                quality.count(parsed_line)
+
+            if do_hits:
+                hits.count(parsed_line)
+
+        if do_bam:
+            samtools_process.stdin.close()
+
+        if do_sam:
+            sam_file.close()
+
+        if do_hits:
+            hits.write()
+
+        if do_taxonomy:
+            print ''.join(taxonomy.get_summary(10))
+            taxonomy.write()
+
+        if do_qual:
+            quality.write()
+
+
+if __name__ == "__main__":
+    CLI.run()
diff -r 000000000000 -r 3ba5983012cf vmap_dnaindex.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vmap_dnaindex.py	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,30 @@
+from argparse import ArgumentParser
+import os
+from vmap import DNAIndex
+
+if __name__ == "__main__":
+    
+    
+    parser = ArgumentParser()
+    
+    a = parser.add_argument
+    a("-o","--html_file",dest="html_file")
+    a("-d","--dir",dest="directory")
+      
+    (options,args)= parser.parse_known_args()
+    
+    args.insert(0,"dummy")
+    try:
+        DNAIndex.run(argv=args)
+    except SystemExit:
+        f = open(options.html_file,'w')
+        rval = ["
BWA Index Galaxy Composite Dataset  
\n"]
+        rval.append('
This composite dataset is composed of the following files:
')
+        flist = os.listdir(options.directory)
+
+        for file in flist:
+            rval.append( '%s  ' )
+    
+        f.write("\n".join( rval ))
+        f.close() 
diff -r 000000000000 -r 3ba5983012cf vmap_dnamap.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vmap_dnamap.xml	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,164 @@
+
+
+Map input reads against a BWA index.  
+
+	
+        
+        samtools 
+        bwa 
+        virana_python 
+        numpy 
+        biopython 
+        htseq 
+     
+
+    vmap.py dnamap
+    
+    
+    
+    --reads $reads
+    
+    #if $paired.select_paired == "paired"
+        --reads $paired.paired_read
+    #end if
+    
+    #if $refseq
+        #set $groups = str($refseq).split(",")
+        #for $ref in $groups:
+            --virana_hit_filter $ref
+        #end for
+    #end if
+        
+
+        #if $bam
+            --bam $bam_output
+        #end if
+        
+        #if $sam
+            --sam $sam_output
+        #end if
+        
+        #if $output_type.hit
+            --virana_hits $hit_output
+            
+            #if $output_type.filter_complexity
+                --filter_complexity
+            #end if
+            
+            #if $output_type.max_mismatches
+                --max_mismatches $output_type.max_mismatches
+            #end if
+            
+            #if $output_type.max_relative_mismatches
+                --max_relative_mismatches $output_type.max_relative_mismatches
+            #end if
+            
+            #if $output_type.min_alignment_score
+                --min_alignment_score $output_type.min_alignment_score
+            #end if
+            
+            #if $output_type.min_continiously_matching
+                --min_continiously_matching $output_type.min_continiously_matching
+            #end if
+            
+            #if $output_type.min_mapping_score
+                --min_mapping_score $output_type.min_mapping_score
+            #end if
+       #end if
+       #if $qual
+            --qual $qual_output 
+       #end if
+        #if $tax
+            --taxonomy $tax_output  
+        #end if
+    
+    
+    #if $sample_id:
+        --sample_id $sample_id
+    #end if 
+    
+    --index_dir $index.extra_files_path
+    
+    $sensitive
+    
+    2>&1
+        
+     
+    
+    
+        
+            single end 
+                paired end 
+            
+            
+           
+                 
+        
+         
+        
+            
+                 
+        
+         
+        
+        UniVec 
+            rRNA 
+            Homo sapiens 
+            Fungi 
+            Plasmids 
+            Protozoa 
+            Viruses 
+            Homo sapiens cDNA 
+            Fungi DRAFT 
+            Bacteria 
+            Bacteria DRAFT 
+        
+        
+         
+    
+    
+    
+        
+            output_type['hit'] 
+         
+        
+            bam 
+         
+        
+            sam 
+         
+        
+            qual 
+         
+        
+            tax' 
+         
+     
+ 
\ No newline at end of file
diff -r 000000000000 -r 3ba5983012cf vmap_rnaindex.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vmap_rnaindex.py	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,29 @@
+from argparse import ArgumentParser
+import os
+from vmap import RNAIndex
+
+if __name__ == "__main__":
+    
+    
+    parser = ArgumentParser()
+    
+    a = parser.add_argument
+    a("-o","--html_file",dest="html_file")
+    a("-d","--dir",dest="directory")
+      
+    (options,args)= parser.parse_known_args()
+    
+    args.insert(0,"dummy")
+    try:
+        RNAIndex.run(argv=args)
+    except SystemExit:
+        f = open(options.html_file,'w')
+        rval = ["
STAR Index Galaxy Composite Dataset  
\n"]
+        rval.append('
This composite dataset is composed of the following files:
')
+        flist = os.listdir(options.directory)
+        for file in flist:
+           rval.append( '%s  ' )
+    
+        f.write("\n".join( rval ))
+        f.close() 
diff -r 000000000000 -r 3ba5983012cf vmap_rnamap.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vmap_rnamap.xml	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,186 @@
+
+
+Map input reads against a STAR index.  
+
+	
+        STAR 
+        samtools 
+        virana_python 
+        numpy 
+        biopython 
+        htseq 
+     
+
+    vmap.py rnamap
+    
+    
+    
+    --reads $reads
+    
+    #if $paired.select_paired == "paired"
+        --reads $paired.paired_read
+    #end if
+    
+    #if $refseq
+        #set $groups = str($refseq).split(",")
+        #for $ref in $groups:
+            --virana_hit_filter $ref
+        #end for
+    #end if
+        
+
+        #if $bam
+            --bam $bam_output
+        #end if
+        
+        #if $sam
+            --sam $sam_output
+        #end if
+        
+        #if $output_type.hit
+            --virana_hits $hit_output
+            
+            $output_type.filter_complexity
+            
+            #if $output_type.max_mismatches
+                --max_mismatches $output_type.max_mismatches
+            #end if
+            
+            #if $output_type.max_relative_mismatches
+                --max_relative_mismatches $output_type.max_relative_mismatches
+            #end if
+            
+            #if $output_type.min_alignment_score
+                --min_alignment_score $output_type.min_alignment_score
+            #end if
+            
+            #if $output_type.min_continiously_matching
+                --min_continiously_matching $output_type.min_continiously_matching
+            #end if
+            
+            #if $output_type.min_mapping_score
+                --min_mapping_score $output_type.min_mapping_score
+            #end if
+       #end if
+       #if $qual
+            --qual $qual_output
+       #end if
+        
+       #if $tax
+            --taxonomy $tax_output
+       #end if
+        #if $chimeric
+            --chimeric_mappings $chimeric_output
+        #end if
+        #if $unmapped
+            --unmapped_end_1 $unmapped1_output
+            #if $paired.select_paired == "paired"
+                --unmapped_end_2 $unmapped2_output
+            #end if
+        #end if
+    
+    #if $sample_id:
+        --sample_id $sample_id
+    #end if 
+        
+    #if $splice_junctions:
+        --splice_junctions $splice_junctions
+    #end if
+    
+    --index_dir $index.extra_files_path
+    
+    $sensitive
+    
+    2>&1
+        
+     
+    
+    
+        
+            single end 
+                paired end 
+            
+            
+           
+                 
+        
+         
+        
+            
+                 
+        
+         
+        
+        UniVec 
+            rRNA 
+            Homo sapiens 
+            Fungi 
+            Plasmids 
+            Protozoa 
+            Viruses 
+            Homo sapiens cDNA 
+            Fungi DRAFT 
+            Bacteria 
+            Bacteria DRAFT 
+        
+        
+         
+    
+    
+    
+        
+            output_type['hit'] 
+         
+        
+            bam 
+         
+        
+            sam 
+         
+        
+            qual 
+         
+        
+            tax 
+         
+        
+            chimeric 
+         
+        
+            unmapped 
+         
+        
+            unmapped 
+             paired['select_paired'] == 'paired' 
+         
+     
+ 
diff -r 000000000000 -r 3ba5983012cf vref.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vref.py	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,582 @@
+#!/usr/bin/env python
+
+import sys
+import logging
+import gzip
+import string
+import os
+import tarfile
+
+from io import BytesIO
+from collections import defaultdict
+
+try:
+    from Bio.SeqRecord import SeqRecord
+    from Bio import SeqIO
+except ImportError:
+    message = 'This script requires the BioPython python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    from plumbum import cli
+except ImportError:
+    message = 'This script requires the plumbum python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    import ftputil
+except ImportError:
+    message = 'This script requires the ftputil python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+
+NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum())
+NON_ID = NON_ID.replace('_', '').replace('-', '')
+NUC_TRANS = string.maketrans('uryswkmbdhvURYSWKMBDHV', 'tnnnnnnnnnnTNNNNNNNNNN')
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+class DataDownloader(object):
+    """ Generic class for downloading fasta records via FTP. """
+
+    def __init__(self, host_name, base_path, user='anonymous', password='anonymous'):
+
+        self.group     = 'NoGroup'
+        self.host_name = host_name
+        self.base_path = base_path
+
+        self.host = ftputil.FTPHost(host_name, user, password)
+        self.host.chdir(base_path)
+
+    @classmethod
+    def _get_fasta_record(self, record, group, family, organism, identifier, sub_identifier=None, description=''):
+
+        record.seq._data = record.seq._data.translate(NUC_TRANS)
+
+        group       = group.translate(None, NON_ID)
+        family      = family.translate(None, NON_ID)
+        organism    = organism.translate(None, NON_ID)
+        identifier  = identifier.translate(None, NON_ID)
+
+        if sub_identifier:
+            sub_identifier = sub_identifier.translate(None, NON_ID)
+            record.id  = ';'.join([group, family, organism, identifier, sub_identifier])
+        else:
+            record.id           = ';'.join([group, family, organism, identifier])
+
+        record.description  = description
+        record.name         = ''
+
+        return record
+
+    def get_fasta_records(self):
+
+        raise NotImplementedError
+
+class SilvaDownlaoder(DataDownloader):
+    """ Downlaods rRNA transcripts from the Silva database. """
+
+    def __init__(self, silva_release='111'):
+
+        self.silva_release      = silva_release
+        self.silva_host_name    = 'ftp.arb-silva.de'
+        self.base_path          = '/release_%s/Exports/' % self.silva_release
+
+        super(SilvaDownlaoder, self).__init__(self.silva_host_name, self.base_path)
+
+        self.group = 'rRNA'
+
+    def get_fasta_records(self):
+
+        remote_path = self.base_path + 'SSURef_%s_NR_tax_silva_trunc.fasta.tgz' % self.silva_release
+        if self.host.path.isfile(remote_path):
+
+            with self.host.file(remote_path, 'rb') as remote_handle:
+                remote_content = BytesIO(remote_handle.read())
+
+                with tarfile.open(fileobj=remote_content) as tar:
+
+                    for subentry in tar.getnames():
+                        if subentry.endswith('.fasta'):
+                            logging.debug('Processing rRNA file %s' % subentry)
+                            subhandle = tar.extractfile(subentry)
+
+                            for record in SeqIO.parse(subhandle, "fasta"):
+                                identifier  = record.id.split('.')[0]
+                                description = '_'.join(record.description.split(' ')[1:])
+                                taxonomy    = description.split(';')
+                                organism    = taxonomy[-1]
+                                family      = 'Unassigned'
+
+                                for level in taxonomy:
+                                    if level.endswith('eae'):
+                                        family = level
+                                        break
+
+                                yield self._get_fasta_record(record, self.group, family, organism, identifier)
+
+class HumanTranscriptDownloader(DataDownloader):
+    """ Downloads human transcripts fasta records that contain the locations
+        of their exons on the human genome within their description line.
+    """
+
+    def __init__(self):
+
+        self.ensemble_host_name  = 'ftp.ensembl.org'
+        self.gtf_path            = '/pub/current_gtf/homo_sapiens'
+        self.cdna_path           = '/pub/current_fasta/homo_sapiens/cdna'
+        self.ncrna_path          = '/pub/current_fasta/homo_sapiens/ncrna'
+
+        super(HumanTranscriptDownloader, self).__init__(self.ensemble_host_name, self.gtf_path)
+
+        self.chromosomes    = set()
+        self.genes          = defaultdict(set)
+        self.regions        = defaultdict(set)
+        self.group          = 'Homo_sapiens_cDNA'
+
+    def _cache_annotations(self):
+        """ Obtains gtf annotations for human transcripts from ensemble and
+            caches them in memory.
+        """
+
+        self.host.chdir(self.gtf_path)
+
+        entries = self.host.listdir(self.host.curdir)
+
+        for entry in entries:
+
+            if self.host.path.isfile(entry):
+                if entry.startswith('Homo_sapiens.') and entry.endswith('.gtf.gz'):
+                    logging.debug('Processing cDNA annotations %s' % entry)
+
+                    with self.host.file(self.gtf_path + '/' + entry, 'rb') as zipped_handle:
+                        remote_content = BytesIO(zipped_handle.read())
+
+                        with gzip.GzipFile(fileobj=remote_content) as gzipfile:
+                            for i, gtf_line in enumerate(gzipfile):
+                                if i % 100000 == 0:
+                                    logging.debug('Cached %i human transcript annotations' % i)
+
+                                self._add_annotation(gtf_line)
+
+    def _get_raw_transcripts(self):
+        """ Obtains coding and noncoding human transcripts from ensemble and
+            yields them as fasta records if they have exons that have a known location
+            on the human genome. Fasta records contain the exon locations in their header
+            lines.
+        """
+
+        for subpath in [self.cdna_path, self.ncrna_path]:
+
+            self.host.chdir(subpath)
+
+            entries = self.host.listdir(self.host.curdir)
+
+            for entry in entries:
+                if self.host.path.isfile(entry):
+                    if entry.endswith('cdna.all.fa.gz') or entry.endswith('ncrna.fa.gz'):
+
+                        logging.debug('Processing human transcript file %s' % entry)
+
+                        with self.host.file(subpath + '/' + entry, 'rb') as zipped_handle:
+                            remote_content = BytesIO(zipped_handle.read())
+
+                            with gzip.GzipFile(fileobj=remote_content) as gzipfile:
+                                for i, record in enumerate(SeqIO.parse(gzipfile, "fasta")):
+
+                                    if record.id.startswith('ENST'):
+
+                                        if i % 100000 == 0:
+                                            logging.debug('Retrieved %i human transcripts' % i)
+
+                                        record.description = ''
+
+                                        yield record
+
+    def _add_annotation(self, gtf_line):
+        """ Parses gtf annotations and stores them in memory. """
+
+        if gtf_line[0] == '#':
+            return
+
+        fields = gtf_line.strip().split('\t')
+        chromosome, source, feature, start, end, score, strands, frame, attributes\
+                                                                    = fields
+
+        start, end = sorted([int(start), int(end)])
+
+        if feature != 'exon':
+            return
+
+        attributes = attributes.replace(' ', '').split(';')
+        transcript_ids = [identifier.split('"')[1] for identifier in attributes\
+                                if identifier.startswith('transcript_id')]
+
+        gene_names = [identifier.split('"')[1] for identifier in attributes\
+                                if identifier.startswith('gene_name')]
+
+        for transcript_id in transcript_ids:
+            self.genes[transcript_id] = self.genes[transcript_id].union(gene_names)
+            self.regions[transcript_id].add((chromosome, start, end))
+
+    def get_fasta_records(self):
+        """ Yields annotated fasta records of human transcripts. """
+
+        logging.debug('Caching annotations')
+        self._cache_annotations()
+
+        logging.debug('Downloading and annotating transcripts')
+        for record in self._get_raw_transcripts():
+            transcript_id   = record.id
+
+            if transcript_id in self.regions:
+
+                mapping_locations = self.regions[transcript_id]
+
+                description = '|'.join([';'.join(map(str, location))\
+                        for location in mapping_locations])
+
+                # Obtain gene name for transcript
+                gene_names = self.genes[transcript_id]
+                if gene_names:
+                    gene_name = list(gene_names)[0]
+                else:
+                    gene_name = transcript_id
+
+                yield self._get_fasta_record(record, self.group , 'Hominidae', 'Homo_sapiens', gene_name, sub_identifier=transcript_id, description=description)
+
+class RefSeqDownloader(DataDownloader):
+    """ Generic downloader that known how to interpret and parse genbank records """
+
+    def __init__(self, group):
+
+        self.refseq_host_name  = 'ftp.ncbi.nih.gov'
+        self.base_path         = '/genomes/' + group
+
+        super(RefSeqDownloader, self).__init__(self.refseq_host_name, self.base_path)
+
+        self.group              = group
+
+
+    @classmethod
+    def _get_project_from_genbank(self, gb_record):
+        """ Parses genbank project identifier from genbank record. """
+
+        project = ''
+        for dbxref in gb_record.dbxrefs:
+            for entry in dbxref.split(' '):
+                if entry.startswith('BioProject'):
+                    project = entry.split(':')[1]
+                    return project
+
+        if not project:
+            for dbxref in gb_record.dbxrefs:
+                for entry in dbxref.split(' '):
+                    if entry.startswith('Project'):
+                        project = entry.split(':')[1]
+                        return project
+
+        return project
+
+    @classmethod
+    def _get_annotations_from_genbank(self, gb_record):
+        """ Parses taxonomic annotation from genbank record. """
+
+        accession   = gb_record.annotations['accessions'][0]
+        organism    = gb_record.annotations['organism'].replace(' ', '_')
+        organism    = '_'.join(organism.split('_')[:2])
+        taxonomy    = ("; ").join(gb_record.annotations['taxonomy'])
+        gi          = gb_record.annotations['gi']
+
+        family      = 'Unassigned'
+        if len(gb_record.annotations['taxonomy']) > 0:
+            for level in gb_record.annotations['taxonomy']:
+                if level.endswith('dae') or level.endswith('aceae'):
+                    family = level
+                    break
+
+        if family == 'Unassigned':
+            logging.debug('Unassigned family found based on %s' % taxonomy)
+
+        return organism, accession, gi, taxonomy, family
+
+    @classmethod
+    def _get_record_from_genbank(self, gb_record):
+        """ Parses sequence from genbank record. """
+
+        return SeqRecord(gb_record.seq)
+
+    @classmethod
+    def _parse_genbank_records(self, file_handle):
+
+        for gb_record in SeqIO.parse(file_handle, 'genbank'):
+
+            project = self._get_project_from_genbank(gb_record)
+            organism, accession, gi, taxonomy, family =\
+                self._get_annotations_from_genbank(gb_record)
+            record = self._get_record_from_genbank(gb_record)
+
+            if len(record) > 0:
+                yield record, project, taxonomy, family, organism, gi
+
+    def _get_nc(self, entry_path):
+
+        if self.host.path.isfile(entry_path):
+            logging.debug('Processing refseq entry %s' % entry_path)
+            remote_handle = self.host.file(entry_path)
+            
+
+            for record, project, taxonomy, family, organism, gi\
+                in self._parse_genbank_records(remote_handle):
+
+                cleaned = self._get_fasta_record(record, self.group , family, organism, gi)
+
+                yield cleaned
+
+    def _get_nz(self, entry_path):
+
+        scaffold_path = entry_path.replace('.gbk', '.scaffold.gbk.tgz')
+        if self.host.path.isfile(scaffold_path):
+            logging.debug('Processing refseq entry %s' % scaffold_path)
+            with self.host.file(scaffold_path, 'rb') as remote_handle:
+                remote_content = BytesIO(remote_handle.read())
+                with tarfile.open(fileobj=remote_content) as tar:
+                    for subentry in tar.getnames():
+                        if subentry.endswith('.gbk'):
+                            logging.debug('Processing subaccession %s' % subentry)
+                            subhandle = tar.extractfile(subentry)
+                            for record, project, taxonomy, family, organism, gi in self._parse_genbank_records(subhandle):
+                                yield self._get_fasta_record(record, self.group , family, organism, gi)
+
+
+    def get_fasta_records(self):
+
+        species_dirs = self.host.listdir(self.host.curdir)
+
+        for species_dir in species_dirs:
+
+            species_path = self.base_path + '/' + species_dir
+
+            if self.host.path.isdir(species_path):
+
+                self.host.chdir(species_path)
+
+                logging.debug('Processing species directory %s' % species_path)
+
+                if self.host.path.isdir(species_path):
+
+                    self.host.chdir(species_path)
+
+                    entries = self.host.listdir(self.host.curdir)
+
+                    for entry in entries:
+
+                        if self.host.path.isfile(self.host.curdir + '/' + entry):
+
+                            if entry.endswith('.gbk') or entry.endswith('.gbk.gz'):
+
+                                logging.debug('Processing entry %s' % entry)
+
+                                entry_path = species_path + '/' + entry
+
+                                if entry.startswith('NC_'):
+                                    for record in self._get_nc(entry_path):
+                                        yield record
+
+                                elif entry.startswith('NZ_'):
+                                    for record in self._get_nz(entry_path):
+                                        yield record
+
+class HumanGenomeDownloader(DataDownloader):
+
+    def __init__(self):
+
+        self.ensemble_host_name  = 'ftp.ensembl.org'
+        self.base_path           = '/pub/current_fasta/homo_sapiens/dna'
+
+        super(HumanGenomeDownloader, self).__init__(self.ensemble_host_name, self.base_path)
+
+        self.group          = 'Homo_sapiens'
+
+    def get_fasta_records(self):
+
+        entries = self.host.listdir(self.host.curdir)
+        for entry in entries:
+
+            if self.host.path.isfile(entry):
+
+                # You may want to change your filtering criteria here if you would like to include patches and haplotypes
+                if entry.endswith('fa.gz') and 'dna_sm.primary_assembly' in entry:
+
+                    logging.debug('Processing human entry %s' % entry)
+
+                    with self.host.file(self.host.getcwd() + '/' + entry, 'rb') as zipped_handle:
+                        remote_content = BytesIO(zipped_handle.read())
+
+                        with gzip.GzipFile(fileobj=remote_content) as gzipfile:
+
+                            for record in SeqIO.parse(gzipfile, "fasta"):
+
+                                identifier      = record.id
+                                organism        = 'Homo_sapiens'
+                                family          = 'Hominidae'
+
+                                yield self._get_fasta_record(record, self.group, family, organism, identifier)
+
+class UniVecDownloader(DataDownloader):
+
+    def __init__(self):
+
+        self.univec_host_name  = 'ftp.ncbi.nih.gov'
+        self.base_path         = '/pub/UniVec'
+
+        super(UniVecDownloader, self).__init__(self.univec_host_name, self.base_path)
+
+        self.group          = 'UniVec'
+
+    def get_fasta_records(self):
+
+        logging.debug('Processing Univec entries')
+
+        with self.host.file(self.host.getcwd() + '/' + 'UniVec') as remote_handle:
+            for record in SeqIO.parse(remote_handle, "fasta"):
+                organism    = '_'.join(record.description.split(' ')[1:])
+                family      = 'Unassigned'
+                identifier  = '000000000'
+
+                yield self._get_fasta_record(record, self.group, family, organism, identifier)
+
+class CLI(cli.Application):
+    """"""
+    PROGNAME = "metamap"
+    VERSION = "1.0.0"
+    DESCRIPTION = """"""
+
+    def main(self, *args):
+
+        if args:
+            print("Unknown command %r" % (args[0]))
+            print self.USAGE
+            return 1
+
+        if not self.nested_command:
+            print("No command given")
+            print self.USAGE
+            return 1
+
+@CLI.subcommand("fasta")
+class References(cli.Application):
+    """ Obtains NCBI RefSeq and Ensembl reference genomes and transcripts"""
+
+    valid_references = ['Fungi', 'Fungi_DRAFT', 'Bacteria',
+              'Bacteria_DRAFT', 'Homo_sapiens',
+              'Viruses', 'UniVec', 'Plasmids',
+              'Protozoa', 'rRNA', 'Homo_sapiens_cDNA']
+
+    references = cli.SwitchAttr(['-r', '--reference'],
+                                   cli.Set(*valid_references, case_sensitive=True),
+                                   list=True, default=valid_references,
+                                   mandatory=False,
+                                   help="Sets the kind of references to obtain")
+
+    fasta_path  = cli.SwitchAttr(['-o', '--output_file'], str, mandatory=True,
+                                 help="Sets the fasta output file")
+
+    zipped      = cli.Flag(["-z", "--zipped"], help="Write gzipped output")
+    debug       = cli.Flag(["-d", "--debug"], help="Enable debug messages")
+
+
+    def main(self):
+        """ Downloads genome fasta files from reference databases"""
+
+        if self.debug:
+            logging.getLogger().setLevel(logging.DEBUG)
+
+        if os.path.dirname(self.fasta_path) and not os.path.exists(os.path.dirname(self.fasta_path)):
+            logging.debug('Making directories for output file %s' % self.fasta_path)
+            os.makedirs(os.path.dirname(self.fasta_path))
+
+        if self.zipped:
+            logging.debug('Making zipped output file %s' % self.fasta_path)
+            output_file = gzip.open(self.fasta_path, 'wb')
+
+        else:
+            logging.debug('Making regular output file %s' % self.fasta_path)
+            output_file = open(self.fasta_path, 'w', buffering=100 * 1024 * 1024)
+
+
+        for reference in self.references:
+
+            if reference == 'UniVec':
+                downloader = UniVecDownloader()
+            elif reference == 'Homo_sapiens':
+                downloader = HumanGenomeDownloader()
+            elif reference == 'rRNA':
+                downloader = SilvaDownlaoder()
+            elif reference == 'Homo_sapiens_cDNA':
+                downloader = HumanTranscriptDownloader()
+            else:
+                downloader = RefSeqDownloader(group=reference)
+
+            for record in downloader.get_fasta_records():
+                SeqIO.write([record], output_file, "fasta")
+
+        output_file.close()
+
+@CLI.subcommand("blast")
+class Blast(cli.Application):
+    """ Obtains blast databases """
+
+    valid_references = ['nt', 'nr']
+
+    references = cli.SwitchAttr(['-r', '--reference_database'],
+                                   cli.Set(*valid_references, case_sensitive=True),
+                                   list=True, default=valid_references,
+                                   mandatory=False,
+                                   help="Sets the kind of database to obtain")
+
+    database_path  = cli.SwitchAttr(['-o', '--output_path'], str, mandatory=True,
+                                 help="Sets the fasta output file")
+
+    debug       = cli.Flag(["-d", "--debug"], help="Enable debug messages")
+
+    def main(self):
+        """ Downloads blast database files """
+
+        if self.debug:
+            logging.getLogger().setLevel(logging.DEBUG)
+
+        if not os.path.exists(self.database_path):
+            logging.debug('Making output directory %s' % self.database_path)
+            os.makedirs(self.database_path)
+
+        host_name = 'ftp.ncbi.nih.gov'
+        host = ftputil.FTPHost(host_name, 'anonymous', 'anonymous')
+
+        for reference in self.references:
+
+            path = '/blast/db'
+            host.chdir(path)
+
+            entries = host.listdir(host.curdir)
+
+            for entry in entries:
+
+                if host.path.isfile(entry):
+
+                    if entry.startswith(reference) and entry.endswith('.tar.gz'):
+                        logging.debug('Downloading %s' % entry)
+                        local_path = os.path.join(self.database_path, entry)
+                        host.download(entry, local_path, 'b')
+                        logging.debug('Unpacking %s' % entry)
+                        tfile = tarfile.open(local_path, 'r:gz')
+                        tfile.extractall(self.database_path)
+                        os.remove(local_path)
+
+if __name__ == "__main__":
+    CLI.run()
+
diff -r 000000000000 -r 3ba5983012cf vref_blast.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vref_blast.xml	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,39 @@
+
+
+Obtains blast databases.  
+
+	
+        virana_python 
+        numpy 
+        biopython 
+
+     
+
+    vref.py blast --output_path $output 
+    
+    #if $database
+        #set $groups = str($database).split(",")
+        #for $ref in $groups:
+            --reference_database $ref
+        #end for
+    #end if
+    
+    
+     
+    
+    
+    
+        nr 
+            nt 
+        
+
+    
+     
+    
+        
+    
+         
+
+ 
\ No newline at end of file
diff -r 000000000000 -r 3ba5983012cf vref_fasta.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vref_fasta.xml	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,50 @@
+
+
+Obtains NCBI RefSeq and Ensembl reference genomes.  
+
+	
+        virana_python 
+        numpy 
+        biopython 
+
+     
+
+    vref.py fasta --output_file $output 
+        
+    #if $refseq
+        #set $groups = str($refseq).split(",")
+        #for $ref in $groups:
+            --reference $ref
+        #end for
+    #end if
+    
+    
+    $zipped
+       
+    2>&1
+     
+    
+    
+        UniVec 
+            rRNA 
+            Homo sapiens 
+            Fungi 
+            Plasmids 
+            Protozoa 
+            Viruses 
+            Homo sapiens cDNA 
+            Fungi DRAFT 
+            Bacteria 
+            Bacteria DRAFT 
+        
+        
+         
+    
+    
+    
+         
+ 
\ No newline at end of file