Commit | Line | Data |
---|---|---|
168e428f PH |
1 | #! /usr/bin/perl |
2 | ||
068aaea8 | 3 | # $Cambridge: exim/doc/doc-docbook/Pre-xml,v 1.2 2005/11/10 12:30:13 ph10 Exp $ |
168e428f PH |
4 | |
5 | # Script to pre-process XML input before processing it for various purposes. | |
6 | # Options specify which transformations are to be done. Monospaced literal | |
7 | # layout blocks are never touched. | |
8 | ||
9 | # Changes: | |
10 | ||
11 | # -abstract: Remove the <abstract> element | |
12 | ||
13 | # -ascii: Replace &#8230; (sic, no x) with ... |
14 | # Replace ’ by ' | |
15 | # Replace “ by " | |
16 | # Replace ” by " | |
17 | # Replace – by - | |
18 | # Replace † by * | |
19 | # Replace ‡ by ** | |
20 | # Replace &#x00a0; (no-break space) by a space | |
21 | # Replace © by (c) | |
22 | # Put quotes round <literal> text | |
23 | # Put quotes round <quote> text | |
068aaea8 | 24 | # |
168e428f | 25 | # -bookinfo: Remove the <bookinfo> element from the file |
068aaea8 | 26 | # |
168e428f PH |
27 | # -fi: Replace "fi" by the fi ligature (&#xFB01;) except when it is in an XML element, or |
28 | # inside a <literal>. | |
068aaea8 PH |
29 | # |
30 | # -html: Certain things are done only for HTML output: | |
31 | # | |
32 | # If <literallayout> is followed by optional space and then a | |
33 | # newline, the space and newline are removed, because otherwise you | |
34 | # get a blank line in the HTML output. | |
35 | # | |
168e428f PH |
36 | # -noindex Remove the XML to generate a Concept and an Options index. |
37 | # -oneindex Ditto, but add XML to generate a single index. | |
38 | ||
39 | ||
40 | ||
# The function that processes text lying outside <literal> elements and
# outside monospaced <literallayout> blocks.
#
# Argument:  the text to be transformed
# Returns:   the transformed text
#
# The package globals $ligatures and $ascii select the transformations; when
# neither is set the text is returned unchanged. No prototype is declared,
# because the function takes one argument.

sub process
{
my($s) = $_[0];

# Replace "fi" by the character reference for the fi ligature. The negative
# lookahead rejects any "fi" from which a ">" can be reached without passing
# a "<" or ">", that is, any "fi" inside an XML tag.

$s =~ s/fi(?![^<>]*>)/&#xFB01;/g if $ligatures;

# Reduce typographic characters to plain ASCII. Each one is recognized both
# as an XML character reference and as the raw character.

if ($ascii)
  {
  $s =~ s/&#8230;|…/.../g;              # Ellipsis (decimal reference)
  $s =~ s/(?i:&#x2019;)|’/'/g;          # Right single quote
  $s =~ s/(?i:&#x201C;)|“/"/g;          # Left double quote
  $s =~ s/(?i:&#x201D;)|”/"/g;          # Right double quote
  $s =~ s/(?i:&#x2013;)|–/-/g;          # En dash
  $s =~ s/(?i:&#x2020;)|†/*/g;          # Dagger
  $s =~ s/(?i:&#x2021;)|‡/**/g;         # Double dagger
  $s =~ s/(?i:&#x00a0;)/ /g;            # No-break space
  $s =~ s/(?i:&#x00a9;)|©/(c)/g;        # Copyright sign
  $s =~ s/<quote>/"/g;
  $s =~ s/<\/quote>/"/g;
  }

return $s;
}
66 | ||
67 | ||
# The main program. It reads XML from the standard input one line at a time,
# applies the transformations selected by the command line options, and
# writes the result to the standard output.

# Option flags. $ascii and $ligatures are also read by process().

$abstract = 0;
$ascii = 0;
$bookinfo = 0;
$html = 0;
$ligatures = 0;
$noindex = 0;
$oneindex = 0;

# Processing state

$inliteral = 0;          # Inside <literal>; may persist across lines
$inliterallayout = 0;    # Inside a <literallayout> handled line by line
$madeindex = 0;          # The single index for -oneindex has been written

foreach $arg (@ARGV)
  {
  if    ($arg eq "-fi")       { $ligatures = 1; }
  elsif ($arg eq "-abstract") { $abstract = 1; }
  elsif ($arg eq "-ascii")    { $ascii = 1; }
  elsif ($arg eq "-bookinfo") { $bookinfo = 1; }
  elsif ($arg eq "-html")     { $html = 1; }
  elsif ($arg eq "-noindex")  { $noindex = 1; }
  elsif ($arg eq "-oneindex") { $oneindex = 1; }
  else  { die "** Pre-xml: Unknown option \"$arg\"\n"; }
  }

while (<STDIN>)
  {
  # Remove <abstract> if required. Only the line on which the element starts
  # is dropped.

  next if ($abstract && /^\s*<abstract>/);

  # Remove <bookinfo> if required: skip up to and including the closing tag.

  if ($bookinfo && /^<bookinfo/)
    {
    while (<STDIN>) { last if /^<\/bookinfo/; }
    next;
    }

  # Copy monospaced literallayout blocks unchanged. For HTML, when the
  # opening tag is followed only by optional white space and a newline, remove
  # both so that no blank line appears in the output. A line that has text
  # after the tag is left alone so that its line break is preserved.

  if (/^<literallayout class="monospaced">/)
    {
    s/^(<literallayout class="monospaced">)\s*\n\z/$1/ if $html;
    print;
    while (<STDIN>)
      {
      print;
      last if /^<\/literallayout>/;
      }
    next;
    }

  # Adjust index-generation code if required: discard each existing <index>
  # element, and for -oneindex write a single replacement the first time.

  if (($noindex || $oneindex) && /^<index[\s>]/)
    {
    while (<STDIN>)
      {
      last if /^<\/index>/;
      }

    if ($oneindex && !$madeindex)
      {
      $madeindex = 1;
      print "<index><title>Index</title></index>\n";
      }

    next;
    }

  # A line that is not in a monospaced literal block; keep track of which
  # parts are in <literal> and which not. The latter get processed by the
  # function above. Items in <literal> get quoted unless they are also in
  # a <literallayout> block, or are already being quoted.

  for (;;)
    {
    # For HTML, remove trailing white space and the newline after an opening
    # <literallayout> tag that stands alone on its line.

    s/^(<literallayout[^>]*>)\s*\n\z/$1/ if $html;
    $inliterallayout = 1 if /^<literallayout/;
    $inliterallayout = 0 if /^<\/literallayout/;

    if ($inliteral)
      {
      # Look for the end of the literal text. A </literal> that is directly
      # followed by </quote> is skipped, as it closes a literal that was not
      # treated as an opening one.

      if (/^(.*?)<\/literal>(?!<\/quote>)(.*)$/)
        {
        print $1;
        print "\"" if $ascii && !$inliterallayout;
        print "</literal>";
        $inliteral = 0;
        $_ = "$2\n";       # Carry on with the rest of the line
        }
      else
        {
        print;             # The whole line is literal text
        last;
        }
      }

    # Not in literal state

    else
      {
      # Look for the start of literal text. A <literal> directly preceded by
      # <quote> is left to process(), which supplies the quotes itself.

      if (/^(.*?)(?<!<quote>)<literal>(.*)$/)
        {
        print &process($1);
        print "<literal>";
        print "\"" if $ascii && !$inliterallayout;
        $inliteral = 1;
        $_ = "$2\n";       # Carry on with the rest of the line
        }
      else
        {
        print &process($_);
        last;
        }
      }
    }   # Loop for different parts of one line
  }     # Loop for multiple lines
187 | ||
188 | # End |