Commit | Line | Data |
---|---|---|
190cf5a4 IK |
1 | #! /usr/bin/perl -w |
2 | # -*- Perl -*- | |
3 | # mknmz - indexer of Namazu | |
4 | # $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $ | |
5 | # | |
6 | # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved. | |
7 | # Copyright (C) 2000-2008 Namazu Project All rights reserved. | |
8 | # This is free software with ABSOLUTELY NO WARRANTY. | |
9 | # | |
10 | # This program is free software; you can redistribute it and/or modify | |
11 | # it under the terms of the GNU General Public License as published by | |
12 | # the Free Software Foundation; either versions 2, or (at your option) | |
13 | # any later version. | |
14 | # | |
15 | # This program is distributed in the hope that it will be useful | |
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 | # GNU General Public License for more details. | |
19 | # | |
20 | # You should have received a copy of the GNU General Public License | |
21 | # along with this program; if not, write to the Free Software | |
22 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA | |
23 | # 02111-1307, USA | |
24 | # | |
25 | # This file must be encoded in EUC-JP encoding | |
26 | # | |
27 | ||
28 | package mknmz; | |
29 | require 5.004; | |
30 | use English; | |
31 | use lib "."; | |
32 | use Cwd; | |
33 | use IO::File; | |
34 | use File::Find; | |
35 | use File::MMagic; | |
36 | use Time::Local; | |
37 | use strict; # be strict since v1.2.0 | |
38 | use Getopt::Long; | |
39 | use File::Copy; | |
40 | use DirHandle; | |
41 | use File::Basename; | |
42 | ||
43 | use vars qw($SYSTEM); | |
44 | # It exists only for back compatibility. | |
45 | $SYSTEM = $English::OSNAME; | |
46 | ||
47 | my $NAMAZU_INDEX_VERSION = "2.0"; | |
48 | ||
49 | my $CodingSystem = "euc"; | |
50 | my $PKGDATADIR = $ENV{'pkgdatadir'} || "/usr/share/namazu"; | |
51 | my $CONFDIR = "/etc/namazu"; # directory where mknmzrc are in. | |
52 | my $LIBDIR = $PKGDATADIR . "/pl"; # directory where library etc. are in. | |
53 | my $FILTERDIR = $PKGDATADIR . "/filter"; # directory where filters are in. | |
54 | my $TEMPLATEDIR = $PKGDATADIR . "/template"; # directory where templates are in. | |
55 | ||
56 | my $DeletedFilesCount = 0; | |
57 | my $UpdatedFilesCount = 0; | |
58 | my $APPENDMODE = 0; | |
59 | my %PhraseHash = (); | |
60 | my %PhraseHashLast = (); | |
61 | my %KeyIndex = (); | |
62 | my %KeyIndexLast = (); | |
63 | my %CheckPoint = ("on" => undef, "continue" => undef); | |
64 | my $ConfigFile = undef; | |
65 | my $MediaType = undef; | |
66 | ||
67 | my $ReplaceCode = undef; # perl code for transforming URI | |
68 | my @Seed = (); | |
69 | my @LoadedRcfiles = (); | |
70 | my $Magic = new File::MMagic; | |
71 | ||
72 | my $ReceiveTERM = 0; | |
73 | ||
74 | STDOUT->autoflush(1); | |
75 | STDERR->autoflush(1); | |
76 | main(); | |
77 | sub main { | |
78 | my $start_time = time; | |
79 | ||
80 | if ($English::PERL_VERSION == 5.008001) { | |
81 | unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) { | |
82 | print "Run mknmz with the environment variable PERL_HASH_SEED=0\n"; | |
83 | exit 1; | |
84 | } | |
85 | } | |
86 | ||
87 | init(); | |
88 | ||
89 | # At first, loading pl/conf.pl to prevent overriding some variables. | |
90 | preload_modules(); | |
91 | ||
92 | # set LANG and bind textdomain | |
93 | util::set_lang(); | |
94 | textdomain('namazu', $util::LANG_MSG); | |
95 | ||
96 | load_modules(); | |
97 | my ($output_dir, @targets) = parse_options(); | |
98 | my ($docid_base, $total_files_num) = prep($output_dir, @targets); | |
99 | ||
100 | my $swap = 1; | |
101 | my $docid_count = 0; | |
102 | my $file_count = 0; | |
103 | my $total_files_size = 0; | |
104 | my $key_count = 0; | |
105 | my $checkpoint = 0; | |
106 | my $flist_ptr = 0; | |
107 | my $processed_files_size = 0; | |
108 | ||
109 | if ($CheckPoint{'continue'}) { | |
110 | # Restore variables | |
111 | eval util::readfile($var::NMZ{'_checkpoint'}) ; | |
112 | } else { | |
113 | print $total_files_num . _(" files are found to be indexed.\n"); | |
114 | } | |
115 | ||
116 | { | |
117 | my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}"); | |
118 | my $fh_flist = util::efopen($var::NMZ{'_flist'}); | |
119 | my %field_indices = (); | |
120 | get_field_index_base(\%field_indices); | |
121 | ||
122 | if ($CheckPoint{'continue'}) { | |
123 | seek($fh_flist, $flist_ptr, 0); | |
124 | } | |
125 | ||
126 | # Process target files one by one | |
127 | while (defined(my $line = <$fh_flist>)) { | |
128 | $flist_ptr += length($line); | |
129 | my $cfile = $line; | |
130 | chomp $cfile; | |
131 | util::dprint(_("target file: ")."$cfile\n"); | |
132 | ||
133 | my ($cfile_size, $num) = | |
134 | process_file($cfile, $docid_count, $docid_base, | |
135 | $file_count, \%field_indices, | |
136 | $fh_errorsfile, $total_files_num); | |
137 | if ($num == 0) { | |
138 | $total_files_num--; | |
139 | next; | |
140 | } else { | |
141 | $docid_count += $num; | |
142 | $file_count++; | |
143 | } | |
144 | ||
145 | $total_files_size += $cfile_size; | |
146 | $processed_files_size += $cfile_size; | |
147 | last if $ReceiveTERM; | |
148 | if ($processed_files_size > $conf::ON_MEMORY_MAX) { | |
149 | if (%KeyIndex) { | |
150 | $key_count = write_index(); | |
151 | print _("Writing index files..."); | |
152 | write_phrase_hash(); | |
153 | print "\n"; | |
154 | } | |
155 | $processed_files_size = 0; | |
156 | $checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>); | |
157 | } | |
158 | } | |
159 | ||
160 | util::fclose($fh_flist); | |
161 | util::fclose($fh_errorsfile); | |
162 | } | |
163 | # This should be out of above blocks because of file handler closing. | |
164 | re_exec($flist_ptr, $docid_count, $docid_base, $start_time, | |
165 | $total_files_size, $total_files_num, | |
166 | $file_count, $key_count) if $checkpoint; | |
167 | ||
168 | if (%KeyIndex) { | |
169 | $key_count = write_index(); | |
170 | print _("Writing index files..."); | |
171 | write_phrase_hash(); | |
172 | print "\n"; | |
173 | } | |
174 | ||
175 | $key_count = get_total_keys() unless $key_count; | |
176 | do_remain_job($total_files_size, $docid_count, $key_count, | |
177 | $start_time); | |
178 | exit 0; | |
179 | } | |
180 | ||
181 | # | |
182 | # FIXME: Very complicated. | |
183 | # | |
184 | sub process_file ($$$$\%$$) { | |
185 | my ($cfile, $docid_count, $docid_base, $file_count, | |
186 | $field_indices, $fh_errorsfile, $total_files_num) = @_; | |
187 | ||
188 | my $processed_num = 0; | |
189 | my $file_size = util::filesize($cfile); | |
190 | ||
191 | if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) { | |
192 | my @parts; | |
193 | @parts = htmlsplit::split($cfile, "NMZ.partial") | |
194 | if ($file_size <= $conf::FILE_SIZE_MAX); | |
195 | if (@parts > 1) { | |
196 | my $id = 0; | |
197 | for my $part (@parts) { | |
198 | next if (defined $conf::EXCLUDE_PATH && | |
199 | "$cfile#$part" =~ /$conf::EXCLUDE_PATH/); | |
200 | my $fname = util::tmpnam("NMZ.partial.$id"); | |
201 | my $fragment = defined $part ? $part : undef; | |
202 | my $uri = generate_uri($cfile, $fragment); | |
203 | my $result = namazu_core($fname, | |
204 | $docid_count + $processed_num, | |
205 | $docid_base, $file_count, | |
206 | $field_indices, $fh_errorsfile, | |
207 | $total_files_num, | |
208 | $uri, $id, $#parts); | |
209 | if ($result > 0) { | |
210 | $processed_num++; | |
211 | my $rname = defined $part ? "$cfile\t$part" : "$cfile"; | |
212 | put_registry($rname); | |
213 | } | |
214 | unlink $fname; | |
215 | $id++; | |
216 | } | |
217 | return ($file_size, $processed_num); | |
218 | } | |
219 | } | |
220 | my $result = namazu_core($cfile, $docid_count, $docid_base, | |
221 | $file_count, $field_indices, | |
222 | $fh_errorsfile, $total_files_num, | |
223 | undef, undef, undef); | |
224 | if ($result > 0) { | |
225 | $processed_num++; | |
226 | put_registry($cfile); | |
227 | } | |
228 | ||
229 | return ($file_size, $processed_num); | |
230 | } | |
231 | ||
232 | # | |
233 | # Load mknmzrcs: | |
234 | # | |
235 | # 1. MKNMZRC environment | |
236 | # | |
237 | # 2. $(sysconfdir)/$(PACKAGE)/mknmzrc | |
238 | # | |
239 | # 3. ~/.mknmzrc | |
240 | # | |
241 | # 4. user-specified mknmzrc set by mknmz --config=file option. | |
242 | # | |
243 | # If multiple files exists, read all of them. | |
244 | # | |
245 | sub load_rcfiles () { | |
246 | my (@cand) = (); | |
247 | ||
248 | # To support Windows. Since they have nasty drive letter convention, | |
249 | # it is necessary to change mknmzrc dynamically with env. variable. | |
250 | push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'}; | |
251 | push @cand, "$CONFDIR/mknmzrc"; | |
252 | push @cand, "$ENV{'HOME'}/.mknmzrc"; | |
253 | ||
254 | util::vprint(_("Reading rcfile: ")); | |
255 | for my $rcfile (@cand) { | |
256 | if (-f $rcfile) { | |
257 | load_rcfile ($rcfile); | |
258 | util::vprint(" $rcfile"); | |
259 | } | |
260 | } | |
261 | util::vprint("\n"); | |
262 | } | |
263 | ||
264 | sub load_rcfile ($) { | |
265 | my ($rcfile) = @_; | |
266 | if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") { | |
267 | util::win32_yen_to_slash(\$rcfile); | |
268 | } | |
269 | return if (grep {m/^$rcfile$/} @LoadedRcfiles); | |
270 | do $rcfile; | |
271 | if ($@) { | |
272 | chop $@; | |
273 | push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'"; | |
274 | }else { | |
275 | push @LoadedRcfiles, $rcfile; | |
276 | } | |
277 | ||
278 | # Dirty workaround. | |
279 | $LIBDIR = $conf::LIBDIR | |
280 | if (defined $conf::LIBDIR && -d $conf::LIBDIR); | |
281 | $FILTERDIR = $conf::FILTERDIR | |
282 | if (defined $conf::FILTERDIR && -d $conf::FILTERDIR); | |
283 | $TEMPLATEDIR = $conf::TEMPLATEDIR | |
284 | if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR); | |
285 | } | |
286 | ||
287 | sub re_exec($$$$$$$$) { | |
288 | my ($flist_ptr, $docid_count, $docid_base, $start_time, | |
289 | $total_files_size, $total_files_num, $file_count, $key_count) = @_; | |
290 | ||
291 | # store variables | |
292 | { | |
293 | my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}"); | |
294 | ||
295 | print $fh_checkpoint <<EOM; | |
296 | \$DeletedFilesCount = $DeletedFilesCount; | |
297 | \$UpdatedFilesCount = $UpdatedFilesCount; | |
298 | \$APPENDMODE = $APPENDMODE; | |
299 | \$flist_ptr = $flist_ptr; | |
300 | \$docid_count = $docid_count; | |
301 | \$docid_base = $docid_base; | |
302 | \$start_time = $start_time; | |
303 | \$total_files_size = $total_files_size; | |
304 | \$total_files_num = $total_files_num; | |
305 | \$key_count = $key_count; | |
306 | \$file_count = $file_count; | |
307 | \$\$ = $$; | |
308 | EOM | |
309 | util::fclose($fh_checkpoint); | |
310 | } | |
311 | ||
312 | @ARGV = ("-S", @ARGV) ; | |
313 | print _("Checkpoint reached: re-exec mknmz...\n"); | |
314 | util::dprint(join ' ', ("::::", @ARGV, "\n")); | |
315 | exec ($0, @ARGV) ; | |
316 | } | |
317 | ||
318 | sub put_registry ($) { | |
319 | my ($filename) = @_; | |
320 | my $fh_registry = util::efopen(">>$var::NMZ{'_r'}"); | |
321 | print $fh_registry $filename, "\n"; | |
322 | util::fclose($fh_registry); | |
323 | } | |
324 | ||
325 | ||
326 | # Initialization | |
327 | # $CodingSystem: Character Coding System 'euc' or 'sjis' | |
328 | sub init () { | |
329 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { | |
330 | $CodingSystem = "sjis"; | |
331 | if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { | |
332 | $CONFDIR = $1 . $CONFDIR ; | |
333 | } | |
334 | if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { | |
335 | $LIBDIR = $1 . $LIBDIR ; | |
336 | } | |
337 | if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { | |
338 | $FILTERDIR = $1 . $FILTERDIR ; | |
339 | } | |
340 | if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { | |
341 | $TEMPLATEDIR = $1 . $TEMPLATEDIR ; | |
342 | } | |
343 | } else { | |
344 | $CodingSystem = "euc"; | |
345 | } | |
346 | ||
347 | $SIG{'INT'} = sub { | |
348 | util::cdie("SIGINT caught! Aborted.\n"); | |
349 | }; | |
350 | ||
351 | $SIG{'TERM'} = sub { | |
352 | print STDERR "SIGTERM caught!\n"; | |
353 | $ReceiveTERM = 1; | |
354 | }; | |
355 | } | |
356 | ||
357 | sub preload_modules () { | |
358 | unshift @INC, $LIBDIR; | |
359 | # workaround for test suites. | |
360 | unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'}; | |
361 | ||
362 | require "var.pl" || die "unable to require \"var.pl\"\n"; | |
363 | require "conf.pl" || die "unable to require \"conf.pl\"\n"; | |
364 | require "util.pl" || die "unable to require \"util.pl\"\n"; | |
365 | require "gettext.pl" || die "unable to require \"gettext.pl\"\n"; | |
366 | require "ext.pl" || die "unable to require \"ext.pl\"\n"; | |
367 | } | |
368 | ||
369 | sub postload_modules () { | |
370 | require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n"; | |
371 | } | |
372 | ||
373 | sub load_modules () { | |
374 | require "usage.pl" || die "unable to require \"usage.pl\"\n"; | |
375 | require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n"; | |
376 | require "wakati.pl" || die "unable to require \"wakati.pl\"\n"; | |
377 | require "seed.pl" || die "unable to require \"seed.pl\"\n"; | |
378 | require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n"; | |
379 | ||
380 | @Seed = seed::init(); | |
381 | } | |
382 | ||
383 | sub load_filtermodules () { | |
384 | unshift @INC, $FILTERDIR; | |
385 | ||
386 | # | |
387 | # Windows modules must be loaded first. | |
388 | # Because OLE filters have low precedence over normal ones. | |
389 | # | |
390 | load_win32modules() if $English::OSNAME eq "MSWin32"; | |
391 | ||
392 | # Check filter modules | |
393 | my @filters = (); | |
394 | @filters = glob "$FILTERDIR/*.pl"; | |
395 | ||
396 | load_filters(@filters); | |
397 | } | |
398 | ||
399 | sub load_win32modules () { | |
400 | # Check filter modules | |
401 | my @filters = (); | |
402 | if (-f "../filter/win32/olemsword.pl") { # to ease developing | |
403 | @filters = glob "../filter/win32/*.pl"; | |
404 | unshift @INC, "../filter/win32"; | |
405 | } else { | |
406 | @filters = glob "$FILTERDIR/win32/*.pl"; | |
407 | unshift @INC, "$FILTERDIR/win32"; | |
408 | } | |
409 | ||
410 | load_filters(@filters); | |
411 | } | |
412 | ||
413 | sub load_filters (@) { | |
414 | my @filters = @_; | |
415 | ||
416 | for my $filter (@filters) { | |
417 | $filter =~ m!([-\w]+)\.pl$!; | |
418 | my $module = $1; | |
419 | require "$module.pl" || die "unable to require \"$module.pl\"\n";; | |
420 | my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv); | |
421 | ||
422 | eval "\@mtypes = ${module}::mediatype();"; | |
423 | die $@ if $@; # eval error | |
424 | eval "\$status = ${module}::status();"; | |
425 | die $@ if $@; | |
426 | eval "\$recursive = ${module}::recursive();"; | |
427 | die $@ if $@; | |
428 | eval "\$pre_codeconv = ${module}::pre_codeconv();"; | |
429 | die $@ if $@; | |
430 | eval "\$post_codeconv = ${module}::post_codeconv();"; | |
431 | die $@ if $@; | |
432 | eval "${module}::add_magic(\$Magic);"; | |
433 | die $@ if $@; | |
434 | ||
435 | for my $mt (@mtypes) { | |
436 | next if (defined $var::Supported{$mt} && | |
437 | $var::Supported{$mt} eq 'yes' && $status eq 'no'); | |
438 | $var::Supported{$mt} = $status; | |
439 | $var::REQUIRE_ACTIONS{$mt} = $module; | |
440 | $var::RECURSIVE_ACTIONS{$mt} = $recursive; | |
441 | $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv; | |
442 | $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv; | |
443 | } | |
444 | } | |
445 | } | |
446 | ||
447 | # Core routine. | |
448 | # | |
449 | # FIXME: Too many parameters. They must be cleared. | |
450 | # | |
451 | sub namazu_core ($$$$$$$$$$) { | |
452 | my ($cfile, $docid_count, $docid_base, | |
453 | $file_count, $field_indices, $fh_errorsfile, $total_files_num, | |
454 | $uri, $part_id, $part_num) = @_; | |
455 | ||
456 | my $headings = ""; | |
457 | my $content = ""; | |
458 | my $weighted_str = ""; | |
459 | my %fields; | |
460 | my $msg_prefix; | |
461 | ||
462 | if ($part_id) { | |
463 | $msg_prefix = " $part_id/$part_num - "; | |
464 | } else { | |
465 | $msg_prefix = $file_count + 1 . "/$total_files_num - "; | |
466 | } | |
467 | ||
468 | unless ($uri) { | |
469 | $uri = generate_uri($cfile); # Make a URI from a file name. | |
470 | } | |
471 | my ($cfile_size, $text_size, $kanji, $mtype) = | |
472 | load_document(\$cfile, \$content, \$weighted_str, | |
473 | \$headings, \%fields); | |
474 | ||
475 | { | |
476 | $fields{'mtime'} = (stat($cfile))[9]; | |
477 | my $utc = $fields{'mtime'}; | |
478 | $utc = time::rfc822time_to_mtime($fields{'date'}) | |
479 | if (defined $fields{'date'}); | |
480 | if ($utc == -1) { | |
481 | my $date = $fields{'date'}; | |
482 | print "$cfile Illegal date format. : $date\n"; | |
483 | print $fh_errorsfile "$cfile Illegal date format. : $date\n"; | |
484 | $utc = $fields{'mtime'}; | |
485 | delete $fields{'date'}; | |
486 | } | |
487 | $fields{'utc'} = $utc; | |
488 | } | |
489 | ||
490 | util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n"); | |
491 | ||
492 | # Check if the file is acceptable. | |
493 | my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri); | |
494 | if (defined $err) { | |
495 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { | |
496 | my $uri2 = codeconv::eucjp_to_shiftjis($uri); | |
497 | print $msg_prefix . "$uri2 $err\n"; | |
498 | } else { | |
499 | print $msg_prefix . "$uri $err\n"; | |
500 | } | |
501 | print $fh_errorsfile "$cfile $err\n"; | |
502 | return 0; # return 0 if error | |
503 | } | |
504 | ||
505 | # Print processing file name as URI. | |
506 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { | |
507 | my $uri2 = codeconv::eucjp_to_shiftjis($uri); | |
508 | print $msg_prefix . "$uri2 [$mtype]\n"; | |
509 | } else { | |
510 | print $msg_prefix . "$uri [$mtype]\n"; | |
511 | } | |
512 | ||
513 | # Add filename. | |
514 | my $filename = defined $cfile ? $cfile : ''; | |
515 | codeconv::toeuc(\$filename); | |
516 | $filename = basename($filename); | |
517 | $fields{'filename'} = $filename; | |
518 | ||
519 | complete_field_info(\%fields, $cfile, $uri, | |
520 | \$headings, \$content, \$weighted_str); | |
521 | put_field_index(\%fields, $field_indices); | |
522 | ||
523 | put_dateindex($cfile); | |
524 | $content .= "\n\n$filename\n\n"; # add filename | |
525 | $content .= $weighted_str; # add weights | |
526 | count_words($docid_count, $docid_base, \$content, $kanji); | |
527 | make_phrase_hash($docid_count, $docid_base, \$content); | |
528 | ||
529 | # assertion | |
530 | util::assert($cfile_size != 0, | |
531 | "cfile_size == 0 at the end of namazu_core."); | |
532 | ||
533 | return $cfile_size; | |
534 | } | |
535 | ||
536 | # | |
537 | # Make the URI from the given file name. | |
538 | # | |
539 | sub generate_uri (@) { | |
540 | my ($file, $fragment) = @_; | |
541 | return "" unless defined $file; | |
542 | ||
543 | # omit a file name if omittable | |
544 | $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o; | |
545 | ||
546 | if (defined $ReplaceCode) { | |
547 | # transforming URI by evaling | |
548 | $_ = $file; | |
549 | eval $ReplaceCode; | |
550 | $file = $_; | |
551 | } | |
552 | ||
553 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { | |
554 | $file =~ s#^([A-Z]):#/$1|#i; # converting a drive part like: /C| | |
555 | } | |
556 | ||
557 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { | |
558 | $file = codeconv::shiftjis_to_eucjp($file); | |
559 | } | |
560 | if (defined $fragment) { | |
561 | codeconv::toeuc(\$fragment); | |
562 | } | |
563 | ||
564 | unless ($var::Opt{'noencodeuri'}) { | |
565 | for my $tmp ($file, $fragment) { | |
566 | next unless defined $tmp; | |
567 | ||
568 | # Escape unsafe characters (not strict) | |
569 | $tmp =~ s/\%/%25/g; # Convert original '%' into '%25' v1.1.1.2 | |
570 | $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/ | |
571 | sprintf("%%%02X",ord($1))/ge; | |
572 | } | |
573 | } | |
574 | ||
575 | ||
576 | my $uri = $file; | |
577 | $uri .= "#" . $fragment if defined $fragment; | |
578 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { | |
579 | # restore '|' for drive letter rule of Win32, OS/2 | |
580 | $uri =~ s!^/([A-Z])%7C!/$1|!i; | |
581 | } | |
582 | return $uri; | |
583 | } | |
584 | ||
585 | ||
586 | sub get_field_index_base (\%) { | |
587 | my ($field_indices) = @_; | |
588 | ||
589 | my @keys = split('\|', $conf::SEARCH_FIELD); | |
590 | if ($var::Opt{'meta'}) { | |
591 | push @keys, (split '\|', $conf::META_TAGS); | |
592 | } | |
593 | for my $key (@keys) { | |
594 | $key = lc($key); | |
595 | my $fname = "$var::NMZ{'field'}.$key"; | |
596 | my $tmp_fname = util::tmpnam("NMZ.field.$key"); | |
597 | my $size = 0; | |
598 | $size = -s $fname if -f $fname; | |
599 | $size += -s $tmp_fname if -f $tmp_fname; | |
600 | $field_indices->{$key} = $size; | |
601 | } | |
602 | } | |
603 | ||
604 | sub complete_field_info (\%$$\$\$\$) { | |
605 | my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_; | |
606 | ||
607 | for my $field (keys %{$fields}) { | |
608 | if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) { | |
609 | delete $fields->{$field}; | |
610 | } | |
611 | } | |
612 | ||
613 | unless (defined($fields->{'title'})) { | |
614 | $fields->{'title'} = gfilter::filename_to_title($cfile, $wsref); | |
615 | } | |
616 | unless (defined($fields->{'date'})) { | |
617 | my $mtime = $fields->{'mtime'}; | |
618 | my $date = util::rfc822time($mtime); | |
619 | $fields->{'date'} = $date; | |
620 | } | |
621 | unless (defined($fields->{'uri'})) { | |
622 | $fields->{'uri'} = $uri; | |
623 | } | |
624 | unless (defined($fields->{'size'})) { | |
625 | $fields->{'size'} = -s $cfile; | |
626 | } | |
627 | unless (defined($fields->{'summary'})) { | |
628 | $fields->{'summary'} = make_summary($contref, $headings, $cfile); | |
629 | } | |
630 | unless (defined($fields->{'from'}) || defined($fields->{'author'})) { | |
631 | $fields->{'from'} = getmsg("unknown"); | |
632 | } | |
633 | } | |
634 | ||
635 | # | |
636 | # Currently, messages for NMZ.* files should be encoded in | |
637 | # EUC-JP currently. We cannot use gettext.pl for the messsage | |
638 | # because gettext.pl may use Shift_JIS encoded messages. | |
639 | # So, we should use the function instead of gettext(). | |
640 | # | |
641 | # FIXME: Ad hoc impl. getmsg() is effective only for "unknown". | |
642 | # | |
643 | sub getmsg($) { | |
644 | my ($msg) = @_; | |
645 | ||
646 | if (util::islang_msg("ja")) { | |
647 | if ($msg eq "unknown") { | |
648 |