Mercurial > repos > bgruening > text_processing
diff ansi2html.sh @ 3:7068d1548234 draft
Uploaded
author | bgruening |
---|---|
date | Sun, 06 Oct 2013 08:22:36 -0400 |
parents | |
children |
line wrap: on
line diff
#!/bin/sh

# Convert ANSI (terminal) colours and attributes to HTML

# Author:
#    http://www.pixelbeat.org/docs/terminal_colours/
# Examples:
#    ls -l --color=always | ansi2html.sh > ls.html
#    git show --color | ansi2html.sh > last_change.html
#    Generally one can use the `script` util to capture full terminal output.
# Changes:
#    V0.1, 24 Apr 2008, Initial release
#    V0.2, 01 Jan 2009, Phil Harnish <philharnish@gmail.com>
#                         Support `git diff --color` output by
#                         matching ANSI codes that specify only
#                         bold or background colour.
#                       P@draigBrady.com
#                         Support `ls --color` output by stripping
#                         redundant leading 0s from ANSI codes.
#                         Support `grep --color=always` by stripping
#                         unhandled ANSI codes (specifically ^[[K).
#    V0.3, 20 Mar 2009, http://eexpress.blog.ubuntu.org.cn/
#                         Remove cat -v usage which mangled non ascii input.
#                         Cleanup regular expressions used.
#                         Support other attributes like reverse, ...
#                       P@draigBrady.com
#                         Correctly nest <span> tags (even across lines).
#                         Add a command line option to use a dark background.
#                         Strip more terminal control codes.
#    V0.4, 17 Sep 2009, P@draigBrady.com
#                         Handle codes with combined attributes and color.
#                         Handle isolated <bold> attributes with css.
#                         Strip more terminal control codes.
#    V0.12, 12 Jul 2011
#      http://github.com/pixelb/scripts/commits/master/scripts/ansi2html.sh
#
# Usage:
#    ansi2html.sh [--bg=dark] [--palette=NAME] [--bg=dark] < input > out.html
#    ansi2html.sh --version | --help
#
# Notes:
#    * The sed scripts use the \x1b, \x07, \x08, \a and \r escapes, which
#      GNU sed understands; other sed implementations may not.
#    * Input and output are handled as UTF-8: the generated page declares
#      charset=utf-8 and the single-character markers used between the
#      pipeline stages are UTF-8 characters.
#    * Python (2 or 3) is optional; it is only used to translate the VT100
#      alternate (line drawing) character set.

# Print the --help text on stderr.
usage() {
  echo "This utility converts ANSI codes in data passed to stdin" >&2
  echo "It has 2 optional parameters:" >&2
  echo "  --bg=dark --palette=linux|solarized|tango|xterm" >&2
  echo "E.g.: ls -l --color=always | ansi2html.sh --bg=dark > ls.html" >&2
}

# Set the 16 base colours P0..P15 from a --palette=NAME option.
# Arguments: $1 - candidate option (may be empty or unrelated)
# Returns:   0 when $1 was a --palette option (the caller should shift it),
#            non-zero when the linux console default was applied to
#            something that is not a palette option.
select_palette() {
  case "$1" in
    --palette=solarized)
      # See http://ethanschoonover.com/solarized
      P0=073642;  P1=D30102;  P2=859900;  P3=B58900
      P4=268BD2;  P5=D33682;  P6=2AA198;  P7=EEE8D5
      P8=002B36;  P9=CB4B16;  P10=586E75; P11=657B83
      P12=839496; P13=6C71C4; P14=93A1A1; P15=FDF6E3
      ;;
    --palette=solarized-xterm)
      # Above mapped onto the xterm 256 color palette
      P0=262626;  P1=AF0000;  P2=5F8700;  P3=AF8700
      P4=0087FF;  P5=AF005F;  P6=00AFAF;  P7=E4E4E4
      P8=1C1C1C;  P9=D75F00;  P10=585858; P11=626262
      P12=808080; P13=5F5FAF; P14=8A8A8A; P15=FFFFD7
      ;;
    --palette=tango)
      # Gnome default
      P0=000000;  P1=CC0000;  P2=4E9A06;  P3=C4A000
      P4=3465A4;  P5=75507B;  P6=06989A;  P7=D3D7CF
      P8=555753;  P9=EF2929;  P10=8AE234; P11=FCE94F
      P12=729FCF; P13=AD7FA8; P14=34E2E2; P15=EEEEEC
      ;;
    --palette=xterm)
      P0=000000;  P1=CD0000;  P2=00CD00;  P3=CDCD00
      P4=0000EE;  P5=CD00CD;  P6=00CDCD;  P7=E5E5E5
      P8=7F7F7F;  P9=FF0000;  P10=00FF00; P11=FFFF00
      P12=5C5CFF; P13=FF00FF; P14=00FFFF; P15=FFFFFF
      ;;
    *)
      # linux console
      P0=000000;  P1=AA0000;  P2=00AA00;  P3=AA5500
      P4=0000AA;  P5=AA00AA;  P6=00AAAA;  P7=AAAAAA
      P8=555555;  P9=FF5555;  P10=55FF55; P11=FFFF55
      P12=5555FF; P13=FF55FF; P14=55FFFF; P15=FFFFFF
      # Only an explicit --palette=linux is consumed by the caller.
      [ "$1" = "--palette=linux" ]
      return
      ;;
  esac
  return 0
}

# Emit the start of the document and the CSS for the 16 base colours.
# Reads P0..P15.
emit_base_css() {
  cat <<EOF
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<style type="text/css">
.ef0,.f0 { color: #${P0}; } .eb0,.b0 { background-color: #${P0}; }
.ef1,.f1 { color: #${P1}; } .eb1,.b1 { background-color: #${P1}; }
.ef2,.f2 { color: #${P2}; } .eb2,.b2 { background-color: #${P2}; }
.ef3,.f3 { color: #${P3}; } .eb3,.b3 { background-color: #${P3}; }
.ef4,.f4 { color: #${P4}; } .eb4,.b4 { background-color: #${P4}; }
.ef5,.f5 { color: #${P5}; } .eb5,.b5 { background-color: #${P5}; }
.ef6,.f6 { color: #${P6}; } .eb6,.b6 { background-color: #${P6}; }
.ef7,.f7 { color: #${P7}; } .eb7,.b7 { background-color: #${P7}; }
.ef8, .f0 > .bold,.bold > .f0 { color: #${P8}; font-weight: normal; }
.ef9, .f1 > .bold,.bold > .f1 { color: #${P9}; font-weight: normal; }
.ef10,.f2 > .bold,.bold > .f2 { color: #${P10}; font-weight: normal; }
.ef11,.f3 > .bold,.bold > .f3 { color: #${P11}; font-weight: normal; }
.ef12,.f4 > .bold,.bold > .f4 { color: #${P12}; font-weight: normal; }
.ef13,.f5 > .bold,.bold > .f5 { color: #${P13}; font-weight: normal; }
.ef14,.f6 > .bold,.bold > .f6 { color: #${P14}; font-weight: normal; }
.ef15,.f7 > .bold,.bold > .f7 { color: #${P15}; font-weight: normal; }
.eb8 { background-color: #${P8}; }
.eb9 { background-color: #${P9}; }
.eb10 { background-color: #${P10}; }
.eb11 { background-color: #${P11}; }
.eb12 { background-color: #${P12}; }
.eb13 { background-color: #${P13}; }
.eb14 { background-color: #${P14}; }
.eb15 { background-color: #${P15}; }
EOF
}

# Emit CSS for colours 16..255 of the default xterm 256 colour palette:
# a 6x6x6 colour cube (16..231) followed by a 24 step grey ramp (232..255).
emit_xterm256_css() {
  red=0
  while [ "$red" -le 5 ]; do
    green=0
    while [ "$green" -le 5 ]; do
      blue=0
      while [ "$blue" -le 5 ]; do
        c=$((16 + red * 36 + green * 6 + blue))
        # Each component is 0 for level 0, otherwise level * 40 + 55.
        r=$(( (red * 40 + 55) * (red > 0) ))
        g=$(( (green * 40 + 55) * (green > 0) ))
        b=$(( (blue * 40 + 55) * (blue > 0) ))
        printf ".ef%d { color: #%2.2x%2.2x%2.2x; } " "$c" "$r" "$g" "$b"
        printf ".eb%d { background-color: #%2.2x%2.2x%2.2x; }\n" \
          "$c" "$r" "$g" "$b"
        blue=$((blue + 1))
      done
      green=$((green + 1))
    done
    red=$((red + 1))
  done

  gray=0
  while [ "$gray" -le 23 ]; do
    c=$((gray + 232))
    l=$((gray * 10 + 8))
    printf ".ef%d { color: #%2.2x%2.2x%2.2x; } " "$c" "$l" "$l" "$l"
    printf ".eb%d { background-color: #%2.2x%2.2x%2.2x; }\n" \
      "$c" "$l" "$l" "$l"
    gray=$((gray + 1))
  done
}

# Emit the CSS that depends on the default background, the attribute
# classes, and the opening <body>/<pre> tags.
# Reads dark_bg (non-empty selects the dark background), P0, P7 and P15.
emit_default_css() {
  if [ "$dark_bg" ]; then
    default_fg=$P7
    default_bg=$P0
    bold_fg=$P15
    bold_weight=normal
  else
    default_fg=$P0
    default_bg=$P15
    bold_fg=$P0
    bold_weight=bold
  fi

  cat <<EOF

.f9 { color: #${default_fg}; }
.b9 { background-color: #${default_bg}; }
.f9 > .bold,.bold > .f9, body.f9 > pre > .bold {
  /* Bold is heavy black on white, or bright white
     depending on the default background */
  color: #${bold_fg};
  font-weight: ${bold_weight};
}
.reverse {
  /* CSS doesnt support swapping fg and bg colours unfortunately,
     so just hardcode something that will look OK on all backgrounds. */
  color: #${P0}; background-color: #${P7};
}
.underline { text-decoration: underline; }
.line-through { text-decoration: line-through; }
.blink { text-decoration: blink; }

</style>
</head>

<body class="f9 b9">
<pre>
EOF
}

# Filter: handle various xterm control sequences, leaving only SGR
# (colour/attribute) codes and the alternate charset switches.
# See /usr/share/doc/xterm-*/ctlseqs.txt
# Reads ${p}.
strip_control_codes() {
  sed "
s#\x1b[^\x1b]*\x1b\\\##g  # strip anything between \e and ST
s#\x1b][0-9]*;[^\a]*\a##g # strip any OSC (xterm title etc.)

#handle carriage returns
s#^.*\r\{1,\}\([^$]\)#\1#
s#\r\$## # strip trailing \r

# strip other non SGR escape sequences
s#[\x07]##g
s#\x1b[]>=\][0-9;]*##g
s#\x1bP+.\{5\}##g
s#${p}[0-9;?]*[^0-9;?m]##g

#remove backspace chars and what they're backspacing over
:rm_bs
s#[^\x08]\x08##g; t rm_bs
"
}

# Filter: escape HTML and normalize the SGR codes before transformation.
# On output every reset code is the single char ° and every other SGR code
# is prefixed with ¡; literal ° and ¡ from the input become entities so
# that they cannot be mistaken for those markers.
# Reads ${p}.
normalize_sgr() {
  sed "
# escape HTML
s#\&#\&amp;#g; s#>#\&gt;#g; s#<#\&lt;#g; s#\"#\&quot;#g

# normalize SGR codes a little

# split 256 colors out and mark so that they're not
# recognised by the following 'split combined' line
:e
s#${p}\([0-9;]\{1,\}\);\([34]8;5;[0-9]\{1,3\}\)m#${p}\1m${p}¬\2m#g; t e
s#${p}\([34]8;5;[0-9]\{1,3\}\)m#${p}¬\1m#g;

:c
s#${p}\([0-9]\{1,\}\);\([0-9;]\{1,\}\)m#${p}\1m${p}\2m#g; t c   # split combined
s#${p}0\([0-7]\)#${p}\1#g                                 #strip leading 0
s#${p}1m\(\(${p}[4579]m\)*\)#\1${p}1m#g                   #bold last (with clr)
s#${p}m#${p}0m#g                                          #add leading 0 to norm

# undo any 256 color marking
s#${p}¬\([34]8;5;[0-9]\{1,3\}\)m#${p}\1m#g;

# map 16 color codes to color + bold
s#${p}9\([0-7]\)m#${p}3\1m${p}1m#g;
s#${p}10\([0-7]\)m#${p}4\1m${p}1m#g;

# change 'reset' code to a single char, and prepend a single char to
# other codes so that we can easily do negative matching, as sed
# does not support look behind expressions etc.
s#°#\&deg;#g; s#${p}0m#°#g
s#¡#\&iexcl;#g; s#${p}[0-9;]*m#¡&#g
"
}

# Filter: convert the marked SGR sequences to <span> tags.
# The sed hold space carries one 's' per currently open span (it persists
# across input lines), so a reset code can be replaced by the matching
# number of </span> tags.
# Reads ${p} and ${P}.
sgr_to_spans() {
  sed "
:ansi_to_span # replace ANSI codes with CSS classes
t ansi_to_span # hack so t commands below only apply to preceeding s cmd

/^[^¡]*°/ { b span_end } # replace 'reset code' if no preceeding code

# common combinations to minimise html (optional)
s#${P}3\([0-7]\)m¡${p}4\([0-7]\)m#\1<span class=\"f\2 b\3\">#;t span_count
s#${P}4\([0-7]\)m¡${p}3\([0-7]\)m#\1<span class=\"f\3 b\2\">#;t span_count

s#${P}1m#\1<span class=\"bold\">#; t span_count
s#${P}4m#\1<span class=\"underline\">#; t span_count
s#${P}5m#\1<span class=\"blink\">#; t span_count
s#${P}7m#\1<span class=\"reverse\">#; t span_count
s#${P}9m#\1<span class=\"line-through\">#; t span_count
s#${P}3\([0-9]\)m#\1<span class=\"f\2\">#; t span_count
s#${P}4\([0-9]\)m#\1<span class=\"b\2\">#; t span_count

s#${P}38;5;\([0-9]\{1,3\}\)m#\1<span class=\"ef\2\">#; t span_count
s#${P}48;5;\([0-9]\{1,3\}\)m#\1<span class=\"eb\2\">#; t span_count

s#${P}[0-9;]*m#\1#g; t ansi_to_span # strip unhandled codes

b # next line of input

# add a corresponding span end flag
:span_count
x; s/^/s/; x
b ansi_to_span

# replace 'reset code' with correct number of </span> tags
:span_end
x
/^s/ {
  s/^.//
  x
  s#°#</span>°#
  b span_end
}
x
s#°##
b ansi_to_span
"
}

# Filter: mark the alternate character set switches.
mark_alt_charset() {
  sed "
# change 'smacs' and 'rmacs' to a single char so that we can easily do
# negative matching, as sed does not support look behind expressions etc.
# Note we don't use ° like above as that's part of the alternate charset.
s#\x1b(0#¡#g;
s#µ#\&micro;#g; s#\x1b(B#µ#g
"
}

# Filter: convert alternative character set.
# Note we convert here, as if we do at start we have to worry about avoiding
# conversion of SGR codes etc., whereas doing here we only have to
# avoid conversions of stuff between &...; or <...>
#
# Note we could use sed to do this based around:
#   sed 'y/abcdefghijklmnopqrstuvwxyz{}`~/▒␉␌␍␊°±␤␋┘┐┌└┼⎺⎻─⎼⎽├┤┴┬│≤≥π£◆·/'
# However that would be very awkward as we need to only conv some input.
# The basic scheme that we do in the python script below is:
#  1. enable transliterate once ¡ char seen
#  2. disable once µ char seen (may be on diff line to ¡)
#  3. never transliterate between &; or <> chars
#
# The python source is kept pure ASCII (the glyphs of the sed expression
# above are written as \u escapes, in the same order) so that it does not
# depend on how the interpreter decodes its command line, and it runs
# under both Python 2 and Python 3. When no interpreter is available, or
# it cannot start, the flag characters are simply stripped.
translate_alt_charset() {
  py=
  for candidate in python python3 python2; do
    if command -v "$candidate" >/dev/null 2>&1; then
      py=$candidate
      break
    fi
  done

  if [ -z "$py" ]; then
    sed 's/[¡µ]//g' # just strip alternative flag chars
    return
  fi

  # Interpreter diagnostics are discarded on purpose: this step is best
  # effort and the sed fallback takes over if the interpreter fails.
  "$py" -c "
import sys

(STANDARD, ALTERNATIVE, HTML_TAG, HTML_ENTITY) = (0, 1, 2, 3)

# Flag characters inserted by the preceding sed stage.
SMACS = u'\xa1'
RMACS = u'\xb5'

# \x60 is the backquote.
old = u'abcdefghijklmnopqrstuvwxyz{}\x60~'
new = (u'\u2592\u2409\u240c\u240d\u240a\xb0\xb1\u2424\u240b\u2518'
       u'\u2510\u250c\u2514\u253c\u23ba\u23bb\u2500\u23bc\u23bd\u251c'
       u'\u2524\u2534\u252c\u2502\u2264\u2265\u03c0\xa3\u25c6\xb7')
table = dict((ord(o), n) for o, n in zip(old, new))

# Use the byte streams where they exist (Python 3).
stdin = getattr(sys.stdin, 'buffer', sys.stdin)
stdout = getattr(sys.stdout, 'buffer', sys.stdout)

raw = stdin.read()
try:
    text = raw.decode('utf-8')
except UnicodeDecodeError:
    # stdin is already consumed, so the sed fallback could not help:
    # strip the flag characters and pass the data through untouched.
    raw = raw.replace(SMACS.encode('utf-8'), b'')
    raw = raw.replace(RMACS.encode('utf-8'), b'')
    stdout.write(raw)
    sys.exit(0)

out = []
state = STANDARD
last_mode = STANDARD
for c in text:
    if state == HTML_TAG:
        if c == '>':
            state = last_mode
    elif state == HTML_ENTITY:
        if c == ';':
            state = last_mode
    else:
        if c == '<':
            state = HTML_TAG
        elif c == '&':
            state = HTML_ENTITY
        elif c == SMACS and state == STANDARD:
            state = ALTERNATIVE
            last_mode = ALTERNATIVE
            continue
        elif c == RMACS and state == ALTERNATIVE:
            state = STANDARD
            last_mode = STANDARD
            continue
        elif state == ALTERNATIVE:
            c = c.translate(table)
    out.append(c)
stdout.write(u''.join(out).encode('utf-8'))
" 2>/dev/null ||
    sed 's/[¡µ]//g' # just strip alternative flag chars
}

# Entry point: parse the options, then convert stdin to an HTML page on
# stdout.
# Arguments: [--version | --help] or [--bg=dark] [--palette=NAME] [--bg=dark]
main() {
  if [ "${1-}" = "--version" ]; then
    echo "0.12"
    return 0
  fi

  if [ "${1-}" = "--help" ]; then
    usage
    return 0
  fi

  # --bg=dark is accepted either before or after the palette option.
  if [ "${1-}" = "--bg=dark" ]; then
    dark_bg=yes
    shift
  fi
  if select_palette "${1-}"; then
    shift
  fi
  if [ "${1-}" = "--bg=dark" ]; then
    dark_bg=yes
    shift
  fi

  p='\x1b\['        #shortcut to match escape codes
  P="\(^[^°]*\)¡$p" #expression to match prepended codes below

  emit_base_css
  emit_xterm256_css
  emit_default_css

  strip_control_codes |
    normalize_sgr |
    sgr_to_spans |
    mark_alt_charset |
    translate_alt_charset

  printf '%s\n' '</pre>' '</body>' '</html>'
}

main "$@"