| 1 | #! /usr/bin/perl |
| 2 | |
| 3 | use strict; |
| 4 | use warnings; |
| 5 | use Getopt::Long; |
| 6 | |
| 7 | |
# The build hosts cannot be relied on to have perl >= 5.14, so this script
# makes no attempt at Unicode awareness: both standard streams are switched
# to raw mode and every pattern further down matches byte by byte.
binmode(STDOUT);
binmode(STDIN);

# -u / --utf8: do not replace unicode characters.
my $want_utf8;
unless (GetOptions('u|utf8!' => \$want_utf8)) {
    die "Usage: $0 [-u|--utf8]\n";
}
| 17 | |
| 18 | # Script to tidy up the output of w3m when it makes a text file. First we |
| 19 | # convert sequences of blank lines into a single blank line, to get everything |
| 20 | # uniform. Then we go through and insert blank lines before chapter and |
| 21 | # sections, also converting chapter titles to uppercase. |
| 22 | |
| 23 | # We also have to do some character translation in the first pass. It seems |
| 24 | # that xmlto now generates Unicode in its HTML pages. This gives three problems: |
| 25 | # (1) It inserts the byte sequence C2 A0 (U+00A0) as a fixed-width space; |
| 26 | # (2) It uses U+25CF as its bullet character. |
| 27 | # (3) It inserts a whole slew of "box drawing" characters round the heading. |
| 28 | |
# First pass.  The whole input is slurped, awkward UTF-8 byte sequences are
# translated to ASCII, and each run of blank lines is squeezed down to one.

# ASCII stand-ins used unless --utf8 was given.  Each entry is a byte pattern
# plus its replacement; the entries are applied to every line in this order.
my @ascii_fallbacks = (
    # (2) bullet -> asterisk
    [ qr/\x{e2}\x{97}\x{8f}/, '*' ],
    [ qr/\x{e2}\x{80}\x{a2}/, '*' ],    # OpenSUSE
    [ qr/\x{e2}\x{96}\x{a1}/, '*' ],    # OpenSUSE
    # (3a) horizontal box drawing -> hyphen
    [ qr/\x{e2}\x{94}[\x{80}\x{81}\x{84}\x{85}\x{88}\x{89}]/, '-' ],
    [ qr/\x{e2}\x{95}[\x{8c}\x{8d}\x{90}]/,                   '-' ],
    [ qr/\x{e2}\x{95}[\x{b4}\x{b6}\x{b8}\x{ba}\x{bc}\x{be}]/, '-' ],
    # (3b) vertical box drawing -> bar
    [ qr/\x{e2}\x{94}[\x{82}\x{83}\x{86}\x{87}\x{8a}\x{8b}]/, '|' ],
    [ qr/\x{e2}\x{95}[\x{8e}\x{8f}\x{91}]/,                   '|' ],
    [ qr/\x{e2}\x{95}[\x{b5}\x{b7}\x{b9}\x{bb}\x{bd}\x{bf}]/, '|' ],
    # (3c) corner box drawing -> plus
    [ qr/\x{e2}\x{94}[\x{8c}-\x{bf}]/,               '+' ],
    [ qr/\x{e2}\x{95}[\x{80}-\x{8b}\x{92}-\x{b0}]/,  '+' ],
    # other
    [ qr/\x{e2}\x{95}\x{b1}/, '/'  ],
    [ qr/\x{e2}\x{95}\x{b2}/, '\\' ],
    [ qr/\x{e2}\x{95}\x{b3}/, 'X'  ],
);

my @lines = <>;
my $lastwasblank = 0;

foreach my $line (@lines) {
    # $line aliases the array element, so every edit below lands in @lines.

    # (1) non-break space -> normal space (done whether or not --utf8 is set)
    $line =~ s/\x{c2}\x{a0}/ /g;

    unless ($want_utf8) {
        foreach my $rule (@ascii_fallbacks) {
            my ($bytes, $ascii) = @$rule;
            $line =~ s/$bytes/$ascii/g;
        }
    }

    # w3m rendering issue apparently only seen by pdp
    # affects section numbers after the ToC, some info on spool-file -lines, etc
    # always appears to be a spurious extra character, safely just dropped.
    # NOTE(review): this removes every remaining C2 byte, which is also the
    # lead byte of genuine two-byte characters U+0080..U+00BF -- confirm that
    # this is acceptable when --utf8 is in effect.
    $line =~ s/\x{c2}//g;

    if ($line =~ /^\s*$/) {
        if ($lastwasblank) {
            # Second and later blank lines of a run vanish from the output.
            $line = "";
        }
        else {
            # The surviving blank line must be truly empty: the heading pass
            # recognises blank lines with /^$/, so stray spaces or tabs left
            # here would hide an adjacent chapter or section heading.
            $line =~ s/^[ \t]+$//;
        }
        $lastwasblank = 1;
        next;
    }
    $lastwasblank = 0;
}
| 73 | |
# Walk forward to the first line that starts with "1. " (the first entry of
# the table of contents), replacing the "Table of Contents" title with an
# upper-case version on the way.  $i is left at that line, or at the end of
# the input when there is no such line.

my $i = 0;
while ($i < scalar @lines) {
    if ($lines[$i] =~ /^Table of Contents/) {
        $lines[$i] = "TABLE OF CONTENTS\n";
    }
    last if $lines[$i] =~ /^1\. /;
    $i++;
}

# Step past that entry and carry on to the next line starting with "1. ",
# which is the heading of the first chapter itself.

$i++;
$i++ until $i >= scalar @lines || $lines[$i] =~ /^1\. /;
| 87 | |
# Process the body.  A heading is a line with a blank line on either side
# whose leading number is the next one expected: "N. " for chapter N+1, or
# "N.M " for section M+1 of the current chapter.  Chapters get two blank
# lines, a rule of 79 "=" characters and an upper-cased title; sections get a
# preceding blank line and an underline of "-" as long as the title.

my $chapter = 0;
my $section = 0;    # defined from the start so "$section + 1" never sees undef

for (; $i < scalar @lines; $i++) {
    # The end of the input counts as a blank line.  Reading $lines[$i+1]
    # directly on the last line would hand undef to the pattern match and
    # raise an "uninitialized value" warning.
    my $following = $i < $#lines ? $lines[$i + 1] : "";

    next if $lines[$i - 1] !~ /^$/ || $following !~ /^$/;

    # Start of chapter

    if ($lines[$i] =~ /^(\d+)\. / && $1 == $chapter + 1) {
        $chapter++;
        $section = 0;
        $lines[$i] = "\n\n" . ("=" x 79) . "\n" . uc($lines[$i]);
    }

    # Start of next section

    elsif ($lines[$i] =~ /^(\d+)\.(\d+) / && $1 == $chapter && $2 == $section + 1) {
        $section++;

        # Measure the title without its newline.  A final line that lacks
        # one would otherwise be glued to its underline and lose a dash.
        (my $title = $lines[$i]) =~ s/\n\z//;
        $lines[$i] = "\n$title\n" . ("-" x length($title)) . "\n";
    }
}

print @lines;
| 117 | |
| 118 | # End |