doc/doc-docbook/Tidytxt

   1 #! /usr/bin/perl
   2
   3 # Script to tidy up the output of w3m when it makes a text file. First we
   4 # convert sequences of blank lines into a single blank line, to get everything
   5 # uniform. Then we go through and insert blank lines before chapter and
   6 # sections, also converting chapter titles to uppercase.
   7
   8 # We also have to do some character translation in the first pass. It seems
   9 # that xmlto now generates Unicode in its HTML pages. This gives three problems:
  10 # (1) It inserts the byte sequence C2 A0 (U+00A0) as a fixed-width space;
  11 # (2) It uses U+25CF as its bullet character.
  12 # (3) It inserts a whole slew of "box drawing" characters round the heading.
  13
  14 @lines = <>;
  15
  16 $lastwasblank = 0;
  17 foreach $line (@lines)
  18   {
  19   # (1) non-break space -> normal space
  20   $line =~ s/\x{c2}\x{a0}/ /g;
  21   # (2) bullet -> asterisk
  22   $line =~ s/\x{e2}\x{97}\x{8f}/*/g;
  23   $line =~ s/\x{e2}\x{80}\x{a2}/*/g; # OpenSUSE
  24   $line =~ s/\x{e2}\x{96}\x{a1}/*/g; # OpenSUSE
  25   # (3a) horizontal box drawing -> hyphen
  26   $line =~ s/\x{e2}\x{94}[\x{80}\x{81}\x{84}\x{85}\x{88}\x{89}]/-/g;
  27   $line =~ s/\x{e2}\x{95}[\x{8c}\x{8d}\x{90}]/-/g;
  28   $line =~ s/\x{e2}\x{95}[\x{b4}\x{b6}\x{b8}\x{ba}\x{bc}\x{be}]/-/g;
  29   # (3b) vertical box drawing -> bar
  30   $line =~ s/\x{e2}\x{94}[\x{82}\x{83}\x{86}\x{87}\x{8a}\x{8b}]/|/g;
  31   $line =~ s/\x{e2}\x{95}[\x{8e}\x{8f}\x{91}]/|/g;
  32   $line =~ s/\x{e2}\x{95}[\x{b5}\x{b7}\x{b9}\x{bb}\x{bd}\x{bf}]/|/g;
  33   # (3c) corner box drawing -> plus
  34   $line =~ s/\x{e2}\x{94}[\x{8c}-\x{bf}]/+/g;
  35   $line =~ s/\x{e2}\x{95}[\x{80}-\x{8b}\x{92}-\x{b0}]/+/g;
  36   # other
  37   $line =~ s/\x{e2}\x{95}\x{b1}/\//g;
  38   $line =~ s/\x{e2}\x{95}\x{b2}/\\/g;
  39   $line =~ s/\x{e2}\x{95}\x{b3}/X/g;
  40
  41   # w3m rendering issue apparently only seen by pdp
  42   # affects section numbers after the ToC, some info on spool-file -lines, etc
  43   # always appears to be a spurious extra character, safely just dropped.
  44   $line =~ s/\x{c2}//g;
  45
  46   if ($line =~ /^\s*$/)
  47     {
  48     $line = "" if $lastwasblank;
  49     $lastwasblank = 1;
  50     next;
  51     }
  52   $lastwasblank = 0;
  53   }
  54
  55 # Find start of TOC, uppercasing its title
  56
  57 for ($i = 0; $i < scalar @lines; $i++)
  58   {
  59   $lines[$i] = "TABLE OF CONTENTS\n" if $lines[$i] =~ /^Table of Contents/;
  60   last if $lines[$i] =~ /^1\. /;
  61   }
  62
  63 # Find start of first chapter
  64
  65 for ($i++; $i < scalar @lines; $i++)
  66   { last if $lines[$i] =~ /^1\. /; }
  67
  68 # Process the body. We can detect the starts of chapters and sections by
  69 # looking for preceding and following blank lines, and then matching against
  70 # the numbers.
  71
  72 $chapter = 0;
  73 for (; $i < scalar @lines; $i++)
  74   {
  75   next if $lines[$i-1] !~ /^$/ || $lines[$i+1] !~ /^$/;
  76
  77   # Start of chapter
  78
  79   if ($lines[$i] =~ /^(\d+)\. / && $1 == $chapter + 1)
  80     {
  81     $chapter++;
  82     $section = 0;
  83     $lines[$i] = "\n\n" . ("=" x 79) . "\n" . uc($lines[$i]);
  84     }
  85
  86   # Start of next section
  87
  88   elsif ($lines[$i] =~ /^(\d+)\.(\d+) / && $1 == $chapter && $2 == $section + 1)
  89     {
  90     $section++;
  91     $lines[$i] = "\n$lines[$i]" . "-" x (length($lines[$i]) - 1) . "\n";
  92     }
  93   }
  94
  95 print @lines;
  96
  97 # End