[exim.git] / doc / doc-scripts / DoIndex

#! /usr/bin/perl -w
# $Cambridge: exim/doc/doc-scripts/DoIndex,v 1.1 2004/10/07 15:04:35 ph10 Exp $

# Script for producing the Index for the Exim manual from the output of the
# SGCAL run. This is copied from the script for the Exim book.


##############################################################################
# Patterns for matching things to be removed from the sort keys

# This was copied from the Exim book processor, but we have now found a
# better way of doing this. Leave the code until I am quite sure...

# $pat[0]  = qr/ \(\\\*see also\*\\[^)]+\)/;
# $pat[1]  = qr/(?<!@)\/\//;                     # //
# $pat[2]  = qr/(?<!@)\/\\/;                     # /\
# $pat[3]  = qr/(?<!@)\\\//;                     # \/
# $pat[4]  = qr/(?<!@) \\                        # non-@ \, followed by one of
#                             (?:
#                             [\.\/] |           # dot or slash
#                             !- |               # !-
#                             !\+ |              # !+
#                             !\. |              # !.
#                             "\+ |              # "+
#                             \([.\/]? |         # ( and optional . or slash
#                             [[\$\\%?!-"] |     # [ $ \ % ! " or -
#                             \*{1,2} |          # * or **
#                             \^{1,2}\/?         # ^ or ^^ and optional slash
#                             )/x;
# $pat[5]  = qr/(?: []\$\\%)?!"] |               # ] $ \ % ) ? " or ! )
#                   \*{1,2}  |                   # * or **            ) optional
#                   \^{1,2})?                    # ^ or ^^            )
#                   \\/x;                        # then \
# $pat[6]  = qr/(?<!@)::/;
# $pat[7]  = qr/\sR[FS]\b/;
# $pat[8]  = qr/``/;
# $pat[9]  = qr/''/;
# $pat[10] = qr/`/;
# $pat[11] = qr/'/;
# $pat[12] = qr/,/;
# $pat[13] = qr/\(e?s\)/;


# Other patterns

# $keysplit = qr/^(.*?)(\|\|.*?)?\s(R[AZ])?\s?(\d+)$/;

# Pattern for splitting a normalised sort key (as built by _cfkey() below)
# into its parts:
#   $1  the primary text
#   $2  the secondary text, including its leading @|@| separator (optional)
#   $3  the range marker, RA or RZ (optional)
#   $4  the page number that ends the line

$keysplit = qr/^(.*?)(\@\|\@\|.*?)?\s(R[AZ])?\s?(\d+)$/;


# Cache of sort keys, indexed by raw index line. The sort calls cf() many
# times for each line, so the substitutions and the split are done only once
# per distinct line instead of once per comparison.

%cfkeys = ();


# Build the sort key for one raw index line. The result is a reference to
#   [ normalised line, primary, secondary, range marker, page number ]
# A line that $keysplit cannot split gets empty text parts and page number 0.
# That sorts exactly as the undefined values used to (undef compares as "" or
# as 0), but no longer provokes "uninitialized value" warnings under -w.

sub _cfkey {
my($raw) = $_[0];
my($key,$p,$s,$r,$n);

return $cfkeys{$raw} if exists $cfkeys{$raw};

$key = $raw;
$key =~ s/\|\|/@|@|/g;              # Turn || into @|@|
$key =~ s/(?<!\@)[^\w\@\s]//g;      # Remove specials, except those preceded by @
$key =~ s/\@_/\x7f/g;               # Make underscore sort late (option names)

# Split up to sort on individual parts

($p,$s,$r,$n) = $key =~ /$keysplit/;

$p = "" if !defined $p;
$s = "" if !defined $s;
$r = "" if !defined $r;
$n = 0  if !defined $n;

return $cfkeys{$raw} = [$key,$p,$s,$r,$n];
}


# The sort function, for use as "sort cf LIST"; it compares the lines in $a
# and $b. Entries are ordered on their primary text (caselessly, then
# casefully), then on their secondary text in the same way, then numerically
# on the page number, and finally on the range marker, so that an RA line
# comes before the RZ line for the same page. When $show_keys is set (the -k
# option), the keys of both lines are printed for every comparison.
#
# The older scheme, which deleted markup using a list of patterns, is no
# longer used here; the patterns themselves are retained, commented out, near
# the top of the file.

sub cf {
my($xkey) = _cfkey($a);
my($ykey) = _cfkey($b);
my($x,$xp,$xs,$xr,$xn) = @$xkey;
my($y,$yp,$ys,$yr,$yn) = @$ykey;

if ($show_keys)
  {
  print "a=$a\n  x=$x\n  xp=$xp\n  xs=$xs\n  xr=$xr\n  xn=$xn\n";
  print "b=$b\n  y=$y\n  yp=$yp\n  ys=$ys\n  yr=$yr\n  yn=$yn\n";
  }

my ($c) = "\L$xp" cmp "\L$yp";        # Caseless, primary text only
$c = $xp cmp $yp if $c == 0;          # Caseful, primary text only
$c = "\L$xs" cmp "\L$ys" if $c == 0;  # Caseless, secondary text only
$c = $xs cmp $ys if $c == 0;          # Caseful, secondary text only
$c = $xn <=> $yn if $c == 0;          # Compare the numbers
$c = $xr cmp $yr if $c == 0;          # Sort RA before RZ
return $c;
}


##############################################################################
# Function for getting the next entry from the @lines vector, using the global
# index $linenumber. If the line at the index starts a range of pages (RA) and
# the line after it ends the range (RZ), the pair is combined into one entry
# and $linenumber is moved on to the RZ line. That's why $linenumber has to be
# global - so we can increment it. If there's a range error, a message is
# written to STDERR and "" is returned.
#
# A line that follows an RA line but is not an RZ line is not consumed: the
# caller's next call examines it in its own right. Otherwise a good entry
# would be lost, and if the lost line were itself an RA line, its RZ partner
# would be reported as a second, spurious, error.

sub getnextentry {
my($line) = $lines[$linenumber];
my($aa,$zz,$tline,$nextline,$tnextline);

if ($line =~ / RA (\d+)/)
  {
  $aa = $1;

  # Look at the following line without consuming it. There is no such line
  # when the RA line is the very last one.

  $nextline = $lines[$linenumber + 1];
  ($zz) = $nextline =~ / RZ (\d+)/ if defined $nextline;

  if (!defined $zz)
    {
    print STDERR "** Bad range data (1)\n";
    print STDERR "   $line\n";
    print STDERR "   $nextline\n" if defined $nextline;
    return "";
    }

  $linenumber++;                # The RZ line has now been used

  # Apart from the range markers and page numbers, the two lines must agree

  $tline = $line;
  $tnextline = $nextline;

  $tline =~ s/ RA \d+//;
  $tnextline =~ s/ RZ \d+//;

  if ($tline ne $tnextline)
    {
    print STDERR "** Bad range data (2)\n";
    print STDERR "   $line\n";
    print STDERR "   $nextline\n";
    return "";
    }

  # A range that starts and ends on the same page is just that page

  $line = ($aa eq $zz)? "$tline $aa" : "$tline $aa--$zz";
  }

elsif ($line =~ / RZ (\d+)/)
  {
  print STDERR "** Bad range data (RZ without RA)\n";
  print STDERR "   $line\n";
  return "";
  }

return $line
}


##############################################################################
# Function for writing one index entry to OUT. The first argument is the text
# of the entry, which is either a single item or has the form
# "primary||secondary"; the second is the text of its references. Nothing is
# written for an entry whose text is empty or all white space.
#
# Each entry is preceded by .newpar. The entry's initial is the first
# character after any leading run of characters other than letters, digits
# and @ (such a run is markup); a digit counts as "0" and anything else is
# taken in upper case. When the initial differs from that of the previous
# entry, and does not sort before "A", .newletter is written too.
#
# A two-part entry becomes a call of the .secondary macro, preceded by a call
# of .primary when the primary text is not that of the previous entry. A
# reference that is exactly 99999 is the dummy given to "see also" lines, and
# is not written. The globals $currentletter and $lastprimary carry the state
# from one call to the next.

sub outline {
my($entry,$pages) = @_;
my($initial) = $entry =~ /^[^A-Za-z0-9\@]*(.)/;

return if $entry =~ /^\s*$/;

# Drop the "see also" dummy; otherwise put # (a space) before the references

$pages = ($pages eq "99999")? "" : "#$pages";

$initial = ($initial =~ /\d/)? "0" : uc($initial);

print OUT ".newpar\n";

unless ($initial eq $currentletter || $initial lt "A")
  {
  print OUT ".newletter\n";
  $currentletter = $initial;
  }

$entry =~ s/\@'/\$'/g;   # Turns @' into $' so that it prints a non-curly quote

my($primary,$secondary) = $entry =~ /^(.+)\|\|(.*)$/;

# Not a two-part item; insert @ if the first char is a dot

if (!defined $primary)
  {
  my($lead) = ($entry =~ /^\./)? "@" : "";
  print OUT "$lead$entry$pages\n";
  $lastprimary = $entry;
  return;
  }

# A two-part item. The primary is written on its own only when it changes.

if ($primary ne $lastprimary)
  {
  print OUT ".primary $primary\n";
  $lastprimary = $primary;
  }

# The items are passed as quoted macro arguments, so double any quotes

foreach my $item ($primary, $secondary)
  {
  $item =~ s/"/""/g;
  }

# The third argument is the primary text without any "see also" part

(my $contprim = $primary) =~ s/ \(\\\*see also\*\\[^)]+\)//;

print OUT qq{.secondary "$primary" "$secondary$pages" "$contprim"\n};
}


##############################################################################
# The main script

# Option flags; all are off unless asked for on the command line.
#   -k  print the sort keys for every comparison made while sorting
#   -s  keep a copy of the sorted lines in z-indexsorted
#   -t  read z-testindex instead of z-rawindex; this also implies -s

$show_keys = $save_sorted = $test_index = 0;

my %option_action =
  (
  "-k" => sub { $show_keys = 1; },
  "-s" => sub { $save_sorted = 1; },
  "-t" => sub { $test_index = 1; $save_sorted = 1; },
  );

# Take the arguments one at a time; anything that is not a known option is
# fatal.

while (@ARGV)
  {
  my $arg = shift @ARGV;
  my $action = $option_action{$arg};
  die "Unknown option $arg\n" unless defined $action;
  $action->();
  }

# Open the input, which is z-testindex when -t was given and z-rawindex
# otherwise, and the output file z-index.

my($indexfile) = $test_index? "z-testindex" : "z-rawindex";

open(IN, $indexfile) or die "Can't open $indexfile\n";
open(OUT, ">z-index") or die "Can't open z-index\n";

# Collect the index lines in @lines. Lines containing $e are contents lines
# and are not kept, but one beginning Index$e supplies the page number at
# which the index starts. The lines that come before the first $e line are
# the "see also" index lines; they have no page number, so the dummy 99999 is
# added to any of them that ends in a non-digit.

@lines = ();
$prestuff = 1;

while (<IN>)
  {
  chomp;

  if (/\$e/)
    {
    $prestuff = 0;
    $index_pagenumber = $1 if /^Index\$e(\d+)/;
    next;
    }

  s/(\D)$/$1 99999/ if $prestuff;          # No number in "see also"
  push @lines, $_;
  }

close(IN);

# Put the lines into index order. The comparison is done by cf(), which
# ignores markup.

print STDERR "Sorting ...\n";
@lines = sort cf @lines;

# When asked (-s or -t), leave a copy of the sorted lines in z-indexsorted,
# for reference

if ($save_sorted)
  {
  open(X, ">z-indexsorted") or die "Can't open z-indexsorted\n";
  print X "$_\n" foreach @lines;
  close(X);
  }

# Heading for the index file. This is a fixed block of directives, written to
# z-index ahead of the entries. As well as layout settings and a page-number
# footer, it sets up the # flag and defines the primary, secondary and
# newletter macros that are called by the lines that outline() writes.
#
# The here-document is interpolated, so each \$ below puts a literal $ into
# z-index. The only variable substituted is $index_pagenumber, which is set
# while the input is read, from a line that begins Index$e followed by digits.
# NOTE(review): if the input contains no such line, $index_pagenumber is
# undefined here and ".page" is written without a number - confirm whether
# that can happen in practice.

print OUT <<"EOF";
.library "a4ps"
.linelength ~~sys.linelength + 16.0

.include "markup.sg"

.indent 3em
.parspace 0
.parindent -3em
.justify left
.
.foot
\$c [~~sys.pagenumber]
.endfoot
.
.cancelflag #
.flag # "\$S*1"
.set INDEX true
.
.macro primary "text"
.if ~~sys.leftonpage < 2ld
.newcolumn
.fi
~~1
.newpar
.endm
.
.macro secondary "prim" "sec" "contprim"
.if ~~sys.leftonpage < 1ld
.newcolumn
.newpar
~~3 \$it\{(continued)\}
.newpar
.fi
##~~2
.endm
.
.macro newletter
.if ~~sys.leftonpage < 4ld
.newcolumn
.else
.space 1ld
.fi
.newpar
.endm
.
.set chapter -1
.page $index_pagenumber
.chapter Index
.columns 2
.newpar
.
EOF

# Process the lines and output the result.
# Note that $linenumber is global, and is changed by getnextentry() for
# pairs of lines that represent ranges.
#
# The lines are sorted, so entries with the same text are adjacent. Their
# references are gathered into one list, separated by ", ", and the entry is
# written by outline() when a line with different text turns up. A reference
# that repeats the one just added is left out.

$lastprimary = "";
$lastref = "";
$currenttext = $currentref = "";
$currentletter = "";
$badrange = 0;

print STDERR "Processing ...\n";

for ($linenumber = 0; $linenumber < @lines; $linenumber++)
  {
  $line = getnextentry();

  if ($line eq "")   # Bad range data - but carry on to get all of it
    {
    $badrange = 1;
    next;
    }

  # Split off the text and reference

  ($text,$ref) = $line =~ /^(.*)\s+([\d-]+)$/;

  # A line that does not end with a reference cannot be indexed. Say so and
  # skip it, instead of taking undefined values into the comparisons below.

  if (!defined $text)
    {
    print STDERR "** No page reference: $line\n";
    next;
    }

  # If same as current text, just add the new reference, unless its a duplicate

  if ($text eq $currenttext)
    {
    if ($ref ne $lastref)
      {
      $currentref .= ", $ref";
      $lastref = $ref;
      }
    next;
    }

  # Not the same as the current text. Output the current text, then
  # set up a new current.

  outline($currenttext, $currentref);

  $currenttext = $text;
  $currentref = $lastref = $ref;
  }

# Output the final line and close the file. The close is checked because
# that is where an error in writing buffered output shows up.

outline($currenttext, $currentref);
close(OUT) or die "Can't write z-index: $!\n";

die "** Aborted\n" if $badrange;

# Format the index. Each command must succeed; otherwise we would claim that
# there is PostScript in zi-ps when there is not.

foreach my $command ("sgcal z-index -to zi-gcode -index /dev/null",
                     "sgtops zi-gcode -to zi-ps")
  {
  system($command) == 0 or die "** Command failed (status $?): $command\n";
  }

print "PostScript in zi-ps\n";

# End
Commit	Line	Data
495ae4b0 PH	1	#! /usr/bin/perl -w
	2	# $Cambridge: exim/doc/doc-scripts/DoIndex,v 1.1 2004/10/07 15:04:35 ph10 Exp $
	3
	4	# Script for producing the Index for the Exim manual from the output of the
	5	# SGCAL run. This is copied from the script for the Exim book.
	6
	7
	8	##############################################################################
	9	# Patterns for matching things to be removed from the sort keys
	10
	11	# This was copied from the Exim book processor, but we have now found a
	12	# better way of doing this. Leave the code until I am quite sure...
	13
	14	# $pat[0] = qr/ \(\\\see also\\\[^)]+\)/;
	15	# $pat[1] = qr/(?<!@)\/\//; # //
	16	# $pat[2] = qr/(?<!@)\/\\/; # /\
	17	# $pat[3] = qr/(?<!@)\\\//; # \/
	18	# $pat[4] = qr/(?<!@) \\ # non-@ \, followed by one of
	19	# (?:
	20	# [\.\/] \| # dot or slash
	21	# !- \| # !-
	22	# !\+ \| # !+
	23	# !\. \| # !.
	24	# "\+ \| # "+
	25	# \([.\/]? \| # ( and optional . or slash
	26	# [[\$\\%?!-"] \| # [ $ \ % ! " or -
	27	# \{1,2} \| # or **
	28	# \^{1,2}\/? # ^ or ^^ and optional slash
	29	# )/x;
	30	# $pat[5] = qr/(?: []\$\\%)?!"] \| # ] $ \ % ) ? " or ! )
	31	# \{1,2} \| # or ** ) optional
	32	# \^{1,2})? # ^ or ^^ )
	33	# \\/x; # then \
	34	# $pat[6] = qr/(?<!@)::/;
	35	# $pat[7] = qr/\sR[FS]\b/;
	36	# $pat[8] = qr/``/;
	37	# $pat[9] = qr/''/;
	38	# $pat[10] = qr/`/;
	39	# $pat[11] = qr/'/;
	40	# $pat[12] = qr/,/;
	41	# $pat[13] = qr/\(e?s\)/;
	42
	43
	44	# Other patterns
	45
	46	# $keysplit = qr/^(.?)(\\|\\|.?)?\s(R[AZ])?\s?(\d+)$/;
	47
	48	$keysplit = qr/^(.?)(\@\\|\@\\|.?)?\s(R[AZ])?\s?(\d+)$/;
	49
	50
	51	# The sort function
	52
	53	sub cf {
	54	my($x,$y) = ($a,$b);
	55
	56	############old#############
	57	#foreach $pattern (@pat) # Remove strings by pattern
	58	# {
	59	# $x =~ s/$pattern//g;
	60	# $y =~ s/$pattern//g;
	61	# }
	62	##########################
	63
	64
65	# Turn \|\| into @\|@\|
66
67	$x =~ s/\\|\\|/@\|@\|/g;
68	$y =~ s/\\|\\|/@\|@\|/g;
69
70	# Remove all special characters, except those preceded by @
71
72	$x =~ s/(?<!\@)[^\w\@\s]//g;
73	$y =~ s/(?<!\@)[^\w\@\s]//g;
74
75	# Remove the escaping @s
76
77	#$x =~ s/\@(.)/$1/g;
78	#$y =~ s/\@(.)/$1/g;
79
80
81
82	################old ########################
83	#$x =~ s/:(\w+):/$1/g; # :fail: etc => fail
84	#$y =~ s/:(\w+):/$1/g;
85
86	#$x =~ s/^\@[^a-z]+/\@/i; # Make keys starting with @
87	#$y =~ s/^\@[^a-z]+/\@/i; # sort on @ followed by the first letter
88	##############################################3
89
90
91	$x =~ s/\@_/\x7f/g; # Make underscore sort late (option names)
92	$y =~ s/\@_/\x7f/g;
93
94	# Split up to sort on individual parts
95
96	my($xp,$xs,$xr,$xn) = $x =~ /$keysplit/;
97	my($yp,$ys,$yr,$yn) = $y =~ /$keysplit/;
98
99	$xr = "" if !defined $xr;
100	$yr = "" if !defined $yr;
101
102	$xs = "" if !defined $xs;
103	$ys = "" if !defined $ys;
104
105	if ($show_keys)
106	{
107	print "a=$a\n x=$x\n xp=$xp\n xs=$xs\n xr=$xr\n xn=$xn\n";
108	print "b=$b\n y=$y\n yp=$yp\n ys=$ys\n yr=$yr\n yn=$yn\n";
109	}
110
111	my ($c) = "\L$xp" cmp "\L$yp"; # Caseless, primary text only
112	$c = $xp cmp $yp if $c == 0; # Caseful, primary text only
113	$c = "\L$xs" cmp "\L$ys" if $c == 0; # Caseless, secondary text only
114	$c = $xs cmp $ys if $c == 0; # Caseful, secondary text only
115	$c = $xn <=> $yn if $c == 0; # Compare the numbers
116	$c = $xr cmp $yr if $c == 0; # Sort RA before RZ
117	return $c;
118	}
119
120
121
122	##############################################################################
123	# Function for getting the next line from the @lines vector, using the global
124	# index $1. If the next pair of lines specifies a range of pages, combine them.
125	# That's why $linenumber has to be global - so we can increment it. If there's
126	# a range error, return "".
127
128	sub getnextentry {
129	my($line) = $lines[$linenumber];
130	my($aa,$zz,$tline,$nextline,$tnextline);
131
132	if ($line =~ / RA (\d+)/)
133	{
134	$aa = $1;
135	$nextline = $lines[++$linenumber];
136	if ($nextline =~ / RZ (\d+)/)
137	{
138	$zz = $1;
139	}
140	else
141	{
142	print STDERR "** Bad range data (1)\n";
143	print STDERR " $line\n";
144	print STDERR " $nextline\n";
145	return "";
146	}
147
148	$tline = $line;
149	$tnextline = $nextline;
150
151	$tline =~ s/ RA \d+//;
152	$tnextline =~ s/ RZ \d+//;
153
154	if ($tline ne $tnextline)
155	{
156	print STDERR "** Bad range data (2)\n";
157	print STDERR " $line\n";
158	print STDERR " $nextline\n";
159	return "";
160	}
161
162	$line = ($aa eq $zz)? "$tline $aa" : "$tline $aa--$zz";
163	}
164
165	elsif ($line =~ / RZ (\d+)/)
166	{
167	print STDERR "** Bad range data (RZ without RA)\n";
168	print STDERR " $line\n";
169	return "";
170	}
171
172	return $line
173	}
174
175
176
177
178	##############################################################################
179	# Function for outputting a line, checking for the current primary
180	# and indenting a bit for secondaries. We also need a newpar
181	# before each item, because the main indent is set to a largish indent
182	# for long reference lists, but the parindent is set to counter this.
183	# This is where we handle the break between letters. We know that any non-
184	# alphamerics at the start of lines are markup, except for @. A reference
185	# value of 99999 is for the "see also" lines. Suppress it.
186
187	sub outline {
188	my($text,$ref) = ($_[0],$_[1]);
189	my ($letter) = $text =~ /^[^A-Za-z0-9\@]*(.)/;
190
191	return if $text =~ /^\s*$/;
192
193	if ($ref eq "99999") # dummy for see also
194	{
195	$ref = ""
196	}
197	else
198	{
199	$ref = "#$ref"; # prepend space
200	}
201
202	if ($letter =~ /\d/) { $letter = "0"; } else { $letter = "\U$letter"; }
203
204	print OUT ".newpar\n";
205
206	if ($letter ne $currentletter && $letter ge "A")
207	{
208	print OUT ".newletter\n";
209	$currentletter = $letter;
210	}
211
212	$text =~ s/\@'/\$'/g; # Turns @' into $' so that it prints a non-curly quote
213
214	if ($text =~ /^(.+)\\|\\|(.*)$/)
215	{
216	my($primary,$secondary) = ($1,$2);
217
218	if ($primary ne $lastprimary)
219	{
220	print OUT ".primary $primary\n";
221	$lastprimary = $primary;
222	}
223
224	$primary =~ s/"/""/g;
225	$secondary =~ s/"/""/g;
226
227	my($contprim) = $primary;
228	$contprim =~ s/ \(\\\see also\\\[^)]+\)//;
229
230	print OUT ".secondary \"$primary\" \"$secondary$ref\" \"$contprim\"\n";
231	}
232
233	# Not a two-part item; insert @ if the first char is a dot
234
235	else
236	{
237	print OUT "@" if $text =~ /^\./;
238	print OUT "$text$ref\n";
239	$lastprimary = $text;
240	}
241	}
242
243
244
245
246
247	##############################################################################
248	# The main script
249
250	$save_sorted = 0;
251	$test_index = 0;
252	$show_keys = 0;
253
254	while (@ARGV > 0)
255	{
256	my($arg) = shift @ARGV;
257	if ($arg eq "-k") { $show_keys = 1; }
258	elsif ($arg eq "-s") { $save_sorted = 1; }
259	elsif ($arg eq "-t") { $test_index = $save_sorted = 1; }
260	else { die "Unknown option $arg\n"; }
261	}
262
263	if ($test_index)
264	{
265	open(IN, "z-testindex") \|\| die "Can't open z-testindex\n";
266	}
267	else
268	{
269	open(IN, "z-rawindex") \|\| die "Can't open z-rawindex\n";
270	}
271
272	open(OUT, ">z-index") \|\| die "Can't open z-index\n";
273
274	# Extract index lines ($e lines are contents). Until we hit the first
275	# $e line, we are dealing with "see also" index lines, for which we want
276	# to turn the line number into 99999.
277
278	$#lines = -1;
279	$prestuff = 1;
280
281	while (<IN>)
282	{
283	s/\n$//;
284	if (/\$e/)
285	{
286	$prestuff = 0;
287	}
288	else
289	{
290	s/(\D)$/$1 99999/ if $prestuff; # No number in "see also"
291	push(@lines, $_);
292	}
293	$index_pagenumber = $1 if /^Index\$e(\d+)/;
294	}
295	close(IN);
296
297	# Sort, ignoring markup
298
299	print STDERR "Sorting ...\n";
300	@lines = sort cf @lines;
301
302	# Keep a copy of the sorted data, for reference
303
304	if ($save_sorted)
305	{
306	open(X, ">z-indexsorted") \|\| die "Can't open z-indexsorted\n";
307	foreach $line (@lines)
308	{
309	print X "$line\n";
310	}
311	close(X);
312	}
313
314	# Heading for the index file
315
316	print OUT <<"EOF";
317	.library "a4ps"
318	.linelength ~~sys.linelength + 16.0
319
320	.include "markup.sg"
321
322	.indent 3em
323	.parspace 0
324	.parindent -3em
325	.justify left
326	.
327	.foot
328	\$c [~~sys.pagenumber]
329	.endfoot
330	.
331	.cancelflag #
332	.flag # "\$S*1"
333	.set INDEX true
334	.
335	.macro primary "text"
336	.if ~~sys.leftonpage < 2ld
337	.newcolumn
338	.fi
339	~~1
340	.newpar
341	.endm
342	.
343	.macro secondary "prim" "sec" "contprim"
344	.if ~~sys.leftonpage < 1ld
345	.newcolumn
346	.newpar
347	~~3 \$it\{(continued)\}
348	.newpar
349	.fi
350	##~~2
351	.endm
352	.
353	.macro newletter
354	.if ~~sys.leftonpage < 4ld
355	.newcolumn
356	.else
357	.space 1ld
358	.fi
359	.newpar
360	.endm
361	.
362	.set chapter -1
363	.page $index_pagenumber
364	.chapter Index
365	.columns 2
366	.newpar
367	.
368	EOF
369
370	# Process the lines and output the result.
371	# Note that $linenumber is global, and is changed by getnextentry() for
372	# pairs of lines that represent ranges.
373
374	$lastprimary = "";
375	$lastref = "";
376	$currenttext = $currentref = "";
377	$currentletter = "";
378	$badrange = 0;
379
380	print STDERR "Processing ...\n";
381
382	for ($linenumber = 0; $linenumber < @lines; $linenumber++)
383	{
384	$line = &getnextentry();
385
386	if ($line eq "") # Bad range data - but carry on to get all of it
387	{
388	$badrange = 1;
389	next;
390	}
391
392	# Split off the text and reference
393
394	($text,$ref) = $line =~ /^(.*)\s+([\d-]+)$/;
395
396	# If same as current text, just add the new reference, unless its a duplicate
397
398	if ($text eq $currenttext)
399	{
400	if ($ref ne $lastref)
401	{
402	$currentref .= ", $ref";
403	$lastref = $ref;
404	}
405	next;
406	}
407
408	# Not the same as the current text. Output the current text, then
409	# set up a new current.
410
411	&outline($currenttext, $currentref);
412
413	$currenttext = $text;
414	$currentref = $lastref = $ref;
415	}
416
417	# Output the final line and close the file
418
419	&outline($currenttext, $currentref);
420	close(OUT);
421
422	die "** Aborted\n" if $badrange;
423
424	# Format the index
425
426	system("sgcal z-index -to zi-gcode -index /dev/null");
427	system("sgtops zi-gcode -to zi-ps");
428	print "PostScript in zi-ps\n";
429
430	# End