| 1 | #! /usr/bin/perl |
| 2 | |
| 3 | # Script to tidy up the output of w3m when it makes a text file. First we |
| 4 | # convert sequences of blank lines into a single blank line, to get everything |
| 5 | # uniform. Then we go through and insert blank lines before chapters and |
| 6 | # sections, also converting chapter titles to uppercase. |
| 7 | |
| 8 | # We also have to do some character translation in the first pass. It seems |
| 9 | # that xmlto now generates Unicode in its HTML pages. This gives three problems: |
| 10 | # (1) It inserts the byte sequence C2 A0 (U+00A0) as a fixed-width space; |
| 11 | # (2) It uses U+25CF as its bullet character. |
| 12 | # (3) It inserts a whole slew of "box drawing" characters round the heading. |
| 13 | |
use strict;
use warnings;

# tidy_first_pass(\@lines)
#
# First pass over the text, editing the array in place. Each line has the
# UTF-8 byte sequences that xmlto/w3m emit translated to plain ASCII, and runs
# of blank lines are collapsed: the first blank line of a run becomes exactly
# "\n" and every following one becomes the empty string (so it prints as
# nothing). Normalising the first blank line matters because the heading
# detection in mark_headings() recognises a blank neighbour with /^$/, which
# does not match a line that still contains spaces.
#
# The input is expected to be raw bytes (no decoding layer), which is why the
# patterns below spell out the individual UTF-8 bytes.
sub tidy_first_pass {
    my ($lines) = @_;
    my $lastwasblank = 0;

    foreach my $line (@{$lines}) {
        # (1) non-break space -> normal space
        $line =~ s/\x{c2}\x{a0}/ /g;
        # (2) bullet -> asterisk
        $line =~ s/\x{e2}\x{97}\x{8f}/*/g;
        $line =~ s/\x{e2}\x{80}\x{a2}/*/g;    # OpenSUSE
        $line =~ s/\x{e2}\x{96}\x{a1}/*/g;    # OpenSUSE
        # (3a) horizontal box drawing -> hyphen
        $line =~ s/\x{e2}\x{94}[\x{80}\x{81}\x{84}\x{85}\x{88}\x{89}]/-/g;
        $line =~ s/\x{e2}\x{95}[\x{8c}\x{8d}\x{90}]/-/g;
        $line =~ s/\x{e2}\x{95}[\x{b4}\x{b6}\x{b8}\x{ba}\x{bc}\x{be}]/-/g;
        # (3b) vertical box drawing -> bar
        $line =~ s/\x{e2}\x{94}[\x{82}\x{83}\x{86}\x{87}\x{8a}\x{8b}]/|/g;
        $line =~ s/\x{e2}\x{95}[\x{8e}\x{8f}\x{91}]/|/g;
        $line =~ s/\x{e2}\x{95}[\x{b5}\x{b7}\x{b9}\x{bb}\x{bd}\x{bf}]/|/g;
        # (3c) corner box drawing -> plus
        $line =~ s/\x{e2}\x{94}[\x{8c}-\x{bf}]/+/g;
        $line =~ s/\x{e2}\x{95}[\x{80}-\x{8b}\x{92}-\x{b0}]/+/g;
        # other: the two diagonals and the diagonal cross
        $line =~ s/\x{e2}\x{95}\x{b1}/\//g;
        $line =~ s/\x{e2}\x{95}\x{b2}/\\/g;
        $line =~ s/\x{e2}\x{95}\x{b3}/X/g;

        # w3m rendering issue apparently only seen by pdp
        # affects section numbers after the ToC, some info on spool-file
        # -lines, etc
        # always appears to be a spurious extra character, safely just dropped.
        $line =~ s/\x{c2}//g;

        if ($line =~ /^\s*$/) {
            # Keep one uniform "\n" per run of blank lines; drop the rest.
            $line = $lastwasblank ? "" : "\n";
            $lastwasblank = 1;
            next;
        }
        $lastwasblank = 0;
    }
    return;
}

# locate_first_chapter(\@lines)
#
# Scans forward to the first line starting with "1. " (taken to be the entry
# for chapter 1 in the table of contents), replacing any "Table of Contents"
# title line met on the way with its upper-case form. It then continues to the
# next line starting with "1. ", which is the start of the first chapter, and
# returns that index. When no such line exists the returned index is past the
# end of the array, so a caller that iterates from it does nothing.
sub locate_first_chapter {
    my ($lines) = @_;
    my $count = scalar @{$lines};
    my $pos   = 0;

    # Find start of TOC, uppercasing its title
    while ($pos < $count) {
        $lines->[$pos] = "TABLE OF CONTENTS\n"
            if $lines->[$pos] =~ /^Table of Contents/;
        last if $lines->[$pos] =~ /^1\. /;
        $pos++;
    }

    # Find start of first chapter: step off the TOC entry, then search on.
    $pos++;
    $pos++ while $pos < $count && $lines->[$pos] !~ /^1\. /;

    return $pos;
}

# mark_headings(\@lines, $start)
#
# Processes the body, from index $start to the end, editing the array in
# place. A line is a heading candidate only when the lines before and after it
# are blank (the start and end of the input count as blank). Candidates are
# then matched against the expected numbering:
#   * "N. "   where N is the next chapter number: the title is upper-cased and
#             preceded by two blank lines and a row of 79 "=" characters;
#   * "N.M "  where N is the current chapter and M the next section number:
#             the title is preceded by a blank line and underlined with "-".
# Requiring the numbers to be the next in sequence stops numbered list items
# in the text from being mistaken for headings.
sub mark_headings {
    my ($lines, $start) = @_;
    my $chapter = 0;
    my $section = 0;
    my $last    = $#{$lines};

    for my $i ($start .. $last) {
        my $before = $i > 0     ? $lines->[$i - 1] : "";
        my $after  = $i < $last ? $lines->[$i + 1] : "";
        next if $before !~ /^$/ || $after !~ /^$/;

        # Start of chapter
        if ($lines->[$i] =~ /^(\d+)\. / && $1 == $chapter + 1) {
            $chapter++;
            $section = 0;
            $lines->[$i] = "\n\n" . ("=" x 79) . "\n" . uc($lines->[$i]);
        }

        # Start of next section
        elsif ($lines->[$i] =~ /^(\d+)\.(\d+) /
            && $1 == $chapter
            && $2 == $section + 1)
        {
            $section++;

            # Strip the newline first so that the underline has the right
            # length, and sits on its own line, even when the heading is the
            # final line of the input and has no newline.
            (my $title = $lines->[$i]) =~ s/\n\z//;
            $lines->[$i] = "\n$title\n" . ("-" x length($title)) . "\n";
        }
    }
    return;
}

# Main program: read all the input (files named on the command line, or
# standard input), tidy it, and write the result to standard output.

my @lines = <>;

tidy_first_pass(\@lines);
mark_headings(\@lines, locate_first_chapter(\@lines));

print @lines;
| 96 | |
| 97 | # End |