[exim.git] / doc / doc-docbook / TidyHTML-spec

#! /usr/bin/perl

# $Cambridge: exim/doc/doc-docbook/TidyHTML-spec,v 1.4 2006/04/04 14:03:49 ph10 Exp $

# Script to tidy up the spec HTML files that are generated by xmlto. The
# following changes are made:
#
# 1. Tidy the index.html file by splitting the very long lines.
# 2. Create reverse links from chapter and section titles back to the TOC.
# 3. Tidy the ix01.html file - the actual index - by splitting long lines.
# 4. Insert links from the letter divisions to the top of the Index.
# 5. Turn <div class="literallayout"><p> into <div class="literallayout"> and
#    a matching </p></div> into </div> to get rid of unwanted vertical white
#    space.
# 6. Before each occurrence of </td> insert &nbsp; so that the table's cell
#    is a little bit wider than the text itself.

# Everything below works on files inside spec_html. Stop at once if the
# directory cannot be entered; otherwise files of the same names in the
# current directory would be read and overwritten instead.

chdir "spec_html" or die "Failed to change directory to spec_html: $!\n";

# Counter for the ids that are added to the TOC entries.

$tocref = 1;

# Read in the index.html file. It's really the TOC.

open(my $toc_in, "<", "index.html") || die "Failed to open index.html for reading: $!\n";
@toc = <$toc_in>;
close($toc_in);

# Insert a newline after every > except when it is preceded by 'class="quote"',
# because the whole toc is generated as one humungous line that is hard to
# check. We have to avoid it in the quote case because that puts a space into
# the output, and similarly for the </span> that comes afterwards. Easy way out
# is just not to do it for all </span> occurrences. Unfortunately, Perl does
# not implement lookbehinds where the alternatives are of different lengths, so
# we have to take two passes.

foreach my $line (@toc)
  {
  $line =~ s/(?<!class="quote")>\s*/>\n/g;
  $line =~ s/<\/span>\n/<\/span>/g;
  }

# Split the lines so that each one is a separate element in the vector. This
# is done in a single pass over the vector; splicing the pieces back in one
# element at a time shifts the rest of the vector on every step.

@toc = map { split /(?<=\n)/, $_ } @toc;

# We want to create reverse links from each chapter and section title back to
# the relevant place in the TOC. Scan the TOC for the relevant entries. Add
# an id to each entry, and create tables that remember the file names and the
# new link ids. A page file is named by every entry that points into it, with
# or without a fragment, so each file name is put on the list of files to
# process only the first time it is seen.

my %listed;

foreach my $line (@toc)
  {
  if ($line =~ /^<a href="((?:ch|ix)\d+\.html)(#[^"]+)?">/)
    {
    my($chix) = $1;
    my($ss) = defined $2 ? $2 : "";     # The fragment is optional
    my($id) = sprintf "%04d", $tocref++;
    $line =~ s/<a/<a id="toc$id"/;
    $backref{"$chix$ss"} = "toc$id";
    push @chlist, $chix unless $listed{$chix}++;
    }
  }

# Write out the modified index.html file. The print and the close are checked
# as well as the open, because an error in writing buffered output may not
# show up until the file is closed.

open(my $toc_out, ">", "index.html") || die "Failed to open index.html for writing: $!\n";
print {$toc_out} @toc or die "Failed to write index.html: $!\n";
close($toc_out) || die "Failed to write index.html: $!\n";

# Now scan each of the other page files and insert the reverse links. While
# we are at it, we tidy up <div class="literallayout"> by removing unwanted
# paragraph marks, which generate unwanted vertical space. We also insert
# &nbsp; before </td> to push table cells apart from each other.

foreach my $file (@chlist)
  {
  open(my $page_in, "<", $file) || die "Failed to open $file for reading: $!\n";
  my @text = <$page_in>;
  close($page_in);

  # Insert a newline after certain elements, and split the lines so that each
  # one is a separate element in the vector. This makes it easier to recognize
  # these elements.

  foreach my $line (@text)
    {
    $line =~ s/<p>\s*(?!\n)/<p>\n/g;
    $line =~ s/<\/p>\s*(?!\n)/<\/p>\n/g;
    $line =~ s/<\/div>\s*(?!\n)/<\/div>\n/g;
    $line =~ s/<div([^>]*)>(?!\n)/<div$1>\n/g;
    }

  @text = map { split /(?<=\n)/, $_ } @text;

  # Set while we are inside a literallayout <div> whose opening <p> has been
  # removed, so that only the matching </p> is removed later.

  my $thisdiv = 0;

  # An index loop is needed because the tests look at the following line and
  # one case steps over it.

  for (my $i = 0; $i < scalar(@text); $i++)
    {
    my $more = $i < $#text;             # True if there is a following line

    if ($text[$i] =~ /^(.*?)<a( xmlns="[^"]+")? id="([^"]+)"><\/a>(.+?)<\/h(.*)$/)
      {
      my($pre, $opt, $id, $title, $post) = ($1, $2, $3, $4, $5);
      $opt = "" if !defined $opt;

      # The pattern stops short of the line terminator; remember it so that
      # the rewritten line is not joined to the one that follows.
      my($eol) = (substr($text[$i], -1) eq "\n")? "\n" : "";

      # Section reference
      my($ref) = $backref{"$file#$id"};

      # If not found, try for a chapter reference
      $ref = $backref{"$file"} if !defined $ref;

      # If still not found, the link just points at index.html
      $ref = "" if !defined $ref;

      # Adjust the line
      $text[$i] = "$pre<a$opt href=\"index.html#$ref\" id=\"$id\">$title</a></h$post$eol";
      }

    elsif ($more && $text[$i] =~ /^<div [^>]*?class="literallayout">$/ && $text[$i+1] eq "<p>\n")
      {
      $text[++$i] = "";
      $thisdiv = 1;
      }
    elsif ($thisdiv && $more && $text[$i] eq "</p>\n" && $text[$i+1] eq "</div>\n")
      {
      $text[$i] = "";
      $thisdiv = 0;
      }
    elsif ($text[$i] =~ /^\s*<\/td>/)
      {
      $text[$i] = "&nbsp;$text[$i]";
      }
    }

  # The print and the close are checked as well as the open, because an error
  # in writing buffered output may not show up until the file is closed.

  open(my $page_out, ">", $file) || die "Failed to open $file for writing: $!\n";
  print {$page_out} @text or die "Failed to write $file: $!\n";
  close($page_out) || die "Failed to write $file: $!\n";
  }

# Now process the ix01.html file

open(my $index_in, "<", "ix01.html") || die "Failed to open ix01.html for reading: $!\n";
@index = <$index_in>;
close($index_in);

# Insert a newline after every > because the whole index is generated as one
# humungous line that is hard to check. Then split the lines so that each one
# is a separate element in the vector. The split is done in a single pass;
# splicing the pieces back in one element at a time shifts the rest of the
# vector on every step.

foreach my $line (@index) { $line =~ s/>\s*/>\n/g; }
@index = map { split /(?<=\n)/, $_ } @index;

# We want to add a list of letters at the top of the index, and link back
# to them from each letter heading. First find the index title and remember
# where to insert the list of letters.

my $listindex;
my $pos;

for ($pos = 0; $pos < scalar(@index); $pos++)
  {
  if ($index[$pos] =~ /^<\/h2>$/)
    {
    $listindex = $pos;
    last;
    }
  }

# If there is no title line, there is no place to put the list. Say so and
# leave the index as it is, rather than putting an empty list in front of the
# first line of the file.

if (!defined $listindex)
  {
  warn "No </h2> line found in ix01.html: letter list not inserted\n";
  }
else
  {
  # Now scan through for the letter headings and build the cross references,
  # while also building up the list to insert. The scan carries on from the
  # title line. A heading is a line with a single character before </h3>; the
  # link is opened at the start of the line before it and closed at its end.

  my $list = "<h4>\n";
  for (; $pos < scalar(@index); $pos++)
    {
    if ($index[$pos] =~ /^(.)<\/h3>$/)
      {
      my $letter = $1;
      $index[$pos-1] =~ s/^/<a id="${letter}B" href="#${letter}T">/;
      $index[$pos] =~ s/$/<\/a>/;
      $list .= "<a id=\"${letter}T\" href=\"#${letter}B\"> $letter</a>\n";
      }
    }

  # Now we know which letters we have, we can insert the list.

  $list .= "</h4>\n";
  splice @index, $listindex, 0, $list;
  }

# Write out the modified ix01.html file. The print and the close are checked
# as well as the open, because an error in writing buffered output may not
# show up until the file is closed.

open(my $index_out, ">", "ix01.html") || die "Failed to open ix01.html for writing: $!\n";
print {$index_out} @index or die "Failed to write ix01.html: $!\n";
close($index_out) || die "Failed to write ix01.html: $!\n";



# End
Commit	Line	Data
168e428f PH	1	#! /usr/bin/perl
168e428f PH	2
4f578862	3	# $Cambridge: exim/doc/doc-docbook/TidyHTML-spec,v 1.4 2006/04/04 14:03:49 ph10 Exp $
168e428f PH	4
	5	# Script to tidy up the spec HTML files that are generated by xmlto. The
	6	# following changes are made:
	7	#
	8	# 1. Tidy the index.html file by splitting the very long lines.
	9	# 2. Create reverse links from chapter and section titles back to the TOC.
	10	# 3. Tidy the ix01.html file - the actual index - by splitting long lines.
	11	# 4. Insert links from the letter divisions to the top of the Index.
068aaea8 PH	12	# 5. Turn <div class="literallayout"><p> into <div class="literallayout"> and
	13	# a matching </p></div> into </div> to get rid of unwanted vertical white
	14	# space.
	15	# 6. Before each occurrence of </td> insert &nbsp; so that the table's cell
	16	# is a little bit wider than the text itself.
168e428f	17
4f578862	18	chdir "spec_html";
168e428f PH	19
	20	$tocref = 1;
	21
	22	# Read in the index.html file. It's really the TOC.
	23
	24	open(IN, "index.html") || die "Failed to open index.html for reading: $!\n";
	25	@toc = <IN>;
	26	close(IN);
	27
4f578862 PH	28	# Insert a newline after every > except when it is preceded by 'class="quote"',
	29	# because the whole toc is generated as one humungous line that is hard to
	30	# check. We have to avoid it in the quote case because that puts a space into
	31	# the output, and similarly for the </span> the comes afterwards. Easy way out
	32	# is just not to do it for all </span> occurrences. Unfortunately, Perl does
	33	# not implement lookbehinds where the alternatives are of different lengths, so
	34	# we have to take two passes.
	35
	36
	37	foreach $line (@toc)
	38	{
	39	$line =~ s/(?<!class="quote")>\s*/>\n/g;
	40	$line =~ s/<\/span>\n/<\/span>/g;
	41	}
	42
	43	# Split the lines so that each one is a separate element in the vector.
168e428f	44
168e428f PH	45	for ($i = 0; $i < scalar(@toc); $i++)
	46	{ splice @toc, $i, 1, (split /(?<=\n)/, $toc[$i]); }
	47
	48	# We want to create reverse links from each chapter and section title back to
	49	# the relevant place in the TOC. Scan the TOC for the relevant entries. Add
	50	# an id to each entry, and create tables that remember the file names and the
	51	# new link ids.
	52
	53	foreach $line (@toc)
	54	{
	55	if ($line =~ /^<a href="((?:ch|ix)\d+\.html)(#[^"]+)?">/)
	56	{
	57	my($chix) = $1;
	58	my($ss) = $2;
	59	my($id) = sprintf "%04d", $tocref++;
	60	$line =~ s/<a/<a id="toc$id"/;
	61	$backref{"$chix$ss"} = "toc$id";
	62	push @chlist, $chix;
	63	}
	64	}
	65
	66	# Write out the modified index.html file.
	67
	68	open (OUT, ">index.html") || die "Failed to open index.html for writing: $!\n";
	69	print OUT @toc;
	70	close(OUT);
	71
068aaea8 PH	72	# Now scan each of the other page files and insert the reverse links. While
	73	# we are at it, we tidy up <div class="literallayout"> by removing unwanted
	74	# paragraph marks, which generate unwanted vertical space. We also insert
	75	# &nbsp; before </td> to push table cells apart from each other.
168e428f PH	76
	77	foreach $file (@chlist)
	78	{
	79	open(IN, "$file") || die "Failed to open $file for reading: $!\n";
	80	@text = <IN>;
	81	close(IN);
	82
068aaea8 PH	83	# Insert a newline after certain elements, and split the lines so that each
	84	# one is a separate element in the vector. This makes it easier to recognize
	85	# these elements.
	86
168e428f PH	87	foreach $line (@text)
168e428f PH	88	{
068aaea8 PH	89	$line =~ s/<p>\s*(?!\n)/<p>\n/g;
	90	$line =~ s/<\/p>\s*(?!\n)/<\/p>\n/g;
	91	$line =~ s/<\/div>\s*(?!\n)/<\/div>\n/g;
	92	$line =~ s/<div([^>]*)>(?!\n)/<div$1>\n/g;
	93	}
	94
	95	for ($i = 0; $i < scalar(@text); $i++)
	96	{ splice @text, $i, 1, (split /(?<=\n)/, $text[$i]); }
	97
	98	$thisdiv = 0;
	99
	100	for ($i = 0; $i < scalar(@text); $i++)
	101	{
	102	if ($text[$i] =~ /^(.*?)<a( xmlns="[^"]+")? id="([^"]+)"><\/a>(.+?)<\/h(.*)$/)
168e428f PH	103	{
	104	my($pre, $opt, $id, $title, $post) = ($1, $2, $3, $4, $5);
	105
	106	# Section reference
	107	my($ref) = $backref{"$file#$id"};
	108
	109	# If not found, try for a chapter reference
	110	$ref = $backref{"$file"} if !defined $ref;
	111
	112	# Adjust the line
068aaea8 PH	113	$text[$i]= "$pre<a$opt href=\"index.html#$ref\" id=\"$id\">$title</a></h$post";
	114	}
	115
9b371988	116	elsif ($text[$i] =~ /^<div [^>]*?class="literallayout">$/ && $text[$i+1] eq "<p>\n")
068aaea8 PH	117	{
	118	$text[++$i] = "";
	119	$thisdiv = 1;
	120	}
	121	elsif ($thisdiv && $text[$i] eq "</p>\n" && $text[$i+1] eq "</div>\n")
	122	{
	123	$text[$i] = "";
	124	$thisdiv = 0;
	125	}
	126	elsif ($text[$i] =~ /^\s*<\/td>/)
	127	{
	128	$text[$i] = "&nbsp;$text[$i]";
168e428f PH	129	}
	130	}
	131
	132	open(OUT, ">$file") || die "Failed to open $file for writing: $!\n";
	133	print OUT @text;
	134	close(OUT);
	135	}
	136
	137	# Now process the ix01.html file
	138
	139	open(IN, "ix01.html") || die "Failed to open ix01.html for reading: $!\n";
	140	@index = <IN>;
	141	close(IN);
	142
	143	# Insert a newline after every > because the whole index is generated as one
	144	# humungous line that is hard to check. Then split the lines so that each one
	145	# is a separate element in the vector.
	146
	147	foreach $line (@index) { $line =~ s/>\s*/>\n/g; }
	148	for ($i = 0; $i < scalar(@index); $i++)
	149	{ splice @index, $i, 1, (split /(?<=\n)/, $index[$i]); }
	150
	151	# We want to add a list of letters at the top of the index, and link back
	152	# to them from each letter heading. First find the index title and remember
	153	# where to insert the list of letters.
	154
	155	for ($i = 0; $i < scalar(@index); $i++)
	156	{
	157	if ($index[$i] =~ /^<\/h2>$/)
	158	{
	159	$listindex = $i;
	160	last;
	161	}
	162	}
	163
	164	# Now scan through for the letter headings and build the cross references,
	165	# while also building up the list to insert.
	166
	167	$list = "<h4>\n";
	168	for (; $i < scalar(@index); $i++)
	169	{
	170	if ($index[$i] =~ /^(.)<\/h3>$/)
	171	{
	172	$letter = $1;
	173	$index[$i-1] =~ s/^/<a id="${letter}B" href="#${letter}T">/;
	174	$index[$i] =~ s/$/<\/a>/;
	175	$list .= "<a id=\"${letter}T\" href=\"#${letter}B\"> $letter</a>\n";
	176	}
	177	}
	178
	179	# Now we know which letters we have, we can insert the list.
	180
	181	$list .= "</h4>\n";
	182	splice @index, $listindex, 0, $list;
	183
	184	# Write out the modified index.html file.
	185
	186	open (OUT, ">ix01.html") || die "Failed to open ix01.html for writing: $!\n";
	187	print OUT @index;
	188	close(OUT);
	189
	190
	191	# End