finish incomplete fix for utf8 on index pages
[mharc.git] / bin / mknmz
CommitLineData
190cf5a4
IK
1#! /usr/bin/perl -w
2# -*- Perl -*-
3# mknmz - indexer of Namazu
4# $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $
5#
6# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
7# Copyright (C) 2000-2008 Namazu Project All rights reserved.
8# This is free software with ABSOLUTELY NO WARRANTY.
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either versions 2, or (at your option)
13# any later version.
14#
15# This program is distributed in the hope that it will be useful
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23# 02111-1307, USA
24#
25# This file must be encoded in EUC-JP encoding
26#
27
28package mknmz;
29require 5.004;
30use English;
31use lib ".";
32use Cwd;
33use IO::File;
34use File::Find;
35use File::MMagic;
36use Time::Local;
37use strict; # be strict since v1.2.0
38use Getopt::Long;
39use File::Copy;
40use DirHandle;
41use File::Basename;
42
43use vars qw($SYSTEM);
44# It exists only for back compatibility.
45$SYSTEM = $English::OSNAME;
46
47my $NAMAZU_INDEX_VERSION = "2.0";
48
49my $CodingSystem = "euc";
50my $PKGDATADIR = $ENV{'pkgdatadir'} || "/usr/share/namazu";
51my $CONFDIR = "/etc/namazu"; # directory where mknmzrc are in.
52my $LIBDIR = $PKGDATADIR . "/pl"; # directory where library etc. are in.
53my $FILTERDIR = $PKGDATADIR . "/filter"; # directory where filters are in.
54my $TEMPLATEDIR = $PKGDATADIR . "/template"; # directory where templates are in.
55
56my $DeletedFilesCount = 0;
57my $UpdatedFilesCount = 0;
58my $APPENDMODE = 0;
59my %PhraseHash = ();
60my %PhraseHashLast = ();
61my %KeyIndex = ();
62my %KeyIndexLast = ();
63my %CheckPoint = ("on" => undef, "continue" => undef);
64my $ConfigFile = undef;
65my $MediaType = undef;
66
67my $ReplaceCode = undef; # perl code for transforming URI
68my @Seed = ();
69my @LoadedRcfiles = ();
70my $Magic = new File::MMagic;
71
72my $ReceiveTERM = 0;
73
74STDOUT->autoflush(1);
75STDERR->autoflush(1);
76main();
77sub main {
78 my $start_time = time;
79
80 if ($English::PERL_VERSION == 5.008001) {
81 unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) {
82 print "Run mknmz with the environment variable PERL_HASH_SEED=0\n";
83 exit 1;
84 }
85 }
86
87 init();
88
89 # At first, loading pl/conf.pl to prevent overriding some variables.
90 preload_modules();
91
92 # set LANG and bind textdomain
93 util::set_lang();
94 textdomain('namazu', $util::LANG_MSG);
95
96 load_modules();
97 my ($output_dir, @targets) = parse_options();
98 my ($docid_base, $total_files_num) = prep($output_dir, @targets);
99
100 my $swap = 1;
101 my $docid_count = 0;
102 my $file_count = 0;
103 my $total_files_size = 0;
104 my $key_count = 0;
105 my $checkpoint = 0;
106 my $flist_ptr = 0;
107 my $processed_files_size = 0;
108
109 if ($CheckPoint{'continue'}) {
110 # Restore variables
111 eval util::readfile($var::NMZ{'_checkpoint'}) ;
112 } else {
113 print $total_files_num . _(" files are found to be indexed.\n");
114 }
115
116 {
117 my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
118 my $fh_flist = util::efopen($var::NMZ{'_flist'});
119 my %field_indices = ();
120 get_field_index_base(\%field_indices);
121
122 if ($CheckPoint{'continue'}) {
123 seek($fh_flist, $flist_ptr, 0);
124 }
125
126 # Process target files one by one
127 while (defined(my $line = <$fh_flist>)) {
128 $flist_ptr += length($line);
129 my $cfile = $line;
130 chomp $cfile;
131 util::dprint(_("target file: ")."$cfile\n");
132
133 my ($cfile_size, $num) =
134 process_file($cfile, $docid_count, $docid_base,
135 $file_count, \%field_indices,
136 $fh_errorsfile, $total_files_num);
137 if ($num == 0) {
138 $total_files_num--;
139 next;
140 } else {
141 $docid_count += $num;
142 $file_count++;
143 }
144
145 $total_files_size += $cfile_size;
146 $processed_files_size += $cfile_size;
147 last if $ReceiveTERM;
148 if ($processed_files_size > $conf::ON_MEMORY_MAX) {
149 if (%KeyIndex) {
150 $key_count = write_index();
151 print _("Writing index files...");
152 write_phrase_hash();
153 print "\n";
154 }
155 $processed_files_size = 0;
156 $checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
157 }
158 }
159
160 util::fclose($fh_flist);
161 util::fclose($fh_errorsfile);
162 }
163 # This should be out of above blocks because of file handler closing.
164 re_exec($flist_ptr, $docid_count, $docid_base, $start_time,
165 $total_files_size, $total_files_num,
166 $file_count, $key_count) if $checkpoint;
167
168 if (%KeyIndex) {
169 $key_count = write_index();
170 print _("Writing index files...");
171 write_phrase_hash();
172 print "\n";
173 }
174
175 $key_count = get_total_keys() unless $key_count;
176 do_remain_job($total_files_size, $docid_count, $key_count,
177 $start_time);
178 exit 0;
179}
180
181#
182# FIXME: Very complicated.
183#
184sub process_file ($$$$\%$$) {
185 my ($cfile, $docid_count, $docid_base, $file_count,
186 $field_indices, $fh_errorsfile, $total_files_num) = @_;
187
188 my $processed_num = 0;
189 my $file_size = util::filesize($cfile);
190
191 if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
192 my @parts;
193 @parts = htmlsplit::split($cfile, "NMZ.partial")
194 if ($file_size <= $conf::FILE_SIZE_MAX);
195 if (@parts > 1) {
196 my $id = 0;
197 for my $part (@parts) {
198 next if (defined $conf::EXCLUDE_PATH &&
199 "$cfile#$part" =~ /$conf::EXCLUDE_PATH/);
200 my $fname = util::tmpnam("NMZ.partial.$id");
201 my $fragment = defined $part ? $part : undef;
202 my $uri = generate_uri($cfile, $fragment);
203 my $result = namazu_core($fname,
204 $docid_count + $processed_num,
205 $docid_base, $file_count,
206 $field_indices, $fh_errorsfile,
207 $total_files_num,
208 $uri, $id, $#parts);
209 if ($result > 0) {
210 $processed_num++;
211 my $rname = defined $part ? "$cfile\t$part" : "$cfile";
212 put_registry($rname);
213 }
214 unlink $fname;
215 $id++;
216 }
217 return ($file_size, $processed_num);
218 }
219 }
220 my $result = namazu_core($cfile, $docid_count, $docid_base,
221 $file_count, $field_indices,
222 $fh_errorsfile, $total_files_num,
223 undef, undef, undef);
224 if ($result > 0) {
225 $processed_num++;
226 put_registry($cfile);
227 }
228
229 return ($file_size, $processed_num);
230}
231
232#
233# Load mknmzrcs:
234#
235# 1. MKNMZRC environment
236#
237# 2. $(sysconfdir)/$(PACKAGE)/mknmzrc
238#
239# 3. ~/.mknmzrc
240#
241# 4. user-specified mknmzrc set by mknmz --config=file option.
242#
243# If multiple files exists, read all of them.
244#
245sub load_rcfiles () {
246 my (@cand) = ();
247
248 # To support Windows. Since they have nasty drive letter convention,
249 # it is necessary to change mknmzrc dynamically with env. variable.
250 push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'};
251 push @cand, "$CONFDIR/mknmzrc";
252 push @cand, "$ENV{'HOME'}/.mknmzrc";
253
254 util::vprint(_("Reading rcfile: "));
255 for my $rcfile (@cand) {
256 if (-f $rcfile) {
257 load_rcfile ($rcfile);
258 util::vprint(" $rcfile");
259 }
260 }
261 util::vprint("\n");
262}
263
264sub load_rcfile ($) {
265 my ($rcfile) = @_;
266 if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
267 util::win32_yen_to_slash(\$rcfile);
268 }
269 return if (grep {m/^$rcfile$/} @LoadedRcfiles);
270 do $rcfile;
271 if ($@) {
272 chop $@;
273 push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'";
274 }else {
275 push @LoadedRcfiles, $rcfile;
276 }
277
278 # Dirty workaround.
279 $LIBDIR = $conf::LIBDIR
280 if (defined $conf::LIBDIR && -d $conf::LIBDIR);
281 $FILTERDIR = $conf::FILTERDIR
282 if (defined $conf::FILTERDIR && -d $conf::FILTERDIR);
283 $TEMPLATEDIR = $conf::TEMPLATEDIR
284 if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR);
285}
286
287sub re_exec($$$$$$$$) {
288 my ($flist_ptr, $docid_count, $docid_base, $start_time,
289 $total_files_size, $total_files_num, $file_count, $key_count) = @_;
290
291 # store variables
292 {
293 my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}");
294
295 print $fh_checkpoint <<EOM;
296 \$DeletedFilesCount = $DeletedFilesCount;
297 \$UpdatedFilesCount = $UpdatedFilesCount;
298 \$APPENDMODE = $APPENDMODE;
299 \$flist_ptr = $flist_ptr;
300 \$docid_count = $docid_count;
301 \$docid_base = $docid_base;
302 \$start_time = $start_time;
303 \$total_files_size = $total_files_size;
304 \$total_files_num = $total_files_num;
305 \$key_count = $key_count;
306 \$file_count = $file_count;
307 \$\$ = $$;
308EOM
309 util::fclose($fh_checkpoint);
310 }
311
312 @ARGV = ("-S", @ARGV) ;
313 print _("Checkpoint reached: re-exec mknmz...\n");
314 util::dprint(join ' ', ("::::", @ARGV, "\n"));
315 exec ($0, @ARGV) ;
316}
317
318sub put_registry ($) {
319 my ($filename) = @_;
320 my $fh_registry = util::efopen(">>$var::NMZ{'_r'}");
321 print $fh_registry $filename, "\n";
322 util::fclose($fh_registry);
323}
324
325
326# Initialization
327# $CodingSystem: Character Coding System 'euc' or 'sjis'
328sub init () {
329 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
330 $CodingSystem = "sjis";
331 if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
332 $CONFDIR = $1 . $CONFDIR ;
333 }
334 if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
335 $LIBDIR = $1 . $LIBDIR ;
336 }
337 if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
338 $FILTERDIR = $1 . $FILTERDIR ;
339 }
340 if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
341 $TEMPLATEDIR = $1 . $TEMPLATEDIR ;
342 }
343 } else {
344 $CodingSystem = "euc";
345 }
346
347 $SIG{'INT'} = sub {
348 util::cdie("SIGINT caught! Aborted.\n");
349 };
350
351 $SIG{'TERM'} = sub {
352 print STDERR "SIGTERM caught!\n";
353 $ReceiveTERM = 1;
354 };
355}
356
357sub preload_modules () {
358 unshift @INC, $LIBDIR;
359 # workaround for test suites.
360 unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'};
361
362 require "var.pl" || die "unable to require \"var.pl\"\n";
363 require "conf.pl" || die "unable to require \"conf.pl\"\n";
364 require "util.pl" || die "unable to require \"util.pl\"\n";
365 require "gettext.pl" || die "unable to require \"gettext.pl\"\n";
366 require "ext.pl" || die "unable to require \"ext.pl\"\n";
367}
368
369sub postload_modules () {
370 require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n";
371}
372
373sub load_modules () {
374 require "usage.pl" || die "unable to require \"usage.pl\"\n";
375 require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n";
376 require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
377 require "seed.pl" || die "unable to require \"seed.pl\"\n";
378 require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";
379
380 @Seed = seed::init();
381}
382
383sub load_filtermodules () {
384 unshift @INC, $FILTERDIR;
385
386 #
387 # Windows modules must be loaded first.
388 # Because OLE filters have low precedence over normal ones.
389 #
390 load_win32modules() if $English::OSNAME eq "MSWin32";
391
392 # Check filter modules
393 my @filters = ();
394 @filters = glob "$FILTERDIR/*.pl";
395
396 load_filters(@filters);
397}
398
399sub load_win32modules () {
400 # Check filter modules
401 my @filters = ();
402 if (-f "../filter/win32/olemsword.pl") { # to ease developing
403 @filters = glob "../filter/win32/*.pl";
404 unshift @INC, "../filter/win32";
405 } else {
406 @filters = glob "$FILTERDIR/win32/*.pl";
407 unshift @INC, "$FILTERDIR/win32";
408 }
409
410 load_filters(@filters);
411}
412
413sub load_filters (@) {
414 my @filters = @_;
415
416 for my $filter (@filters) {
417 $filter =~ m!([-\w]+)\.pl$!;
418 my $module = $1;
419 require "$module.pl" || die "unable to require \"$module.pl\"\n";;
420 my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv);
421
422 eval "\@mtypes = ${module}::mediatype();";
423 die $@ if $@; # eval error
424 eval "\$status = ${module}::status();";
425 die $@ if $@;
426 eval "\$recursive = ${module}::recursive();";
427 die $@ if $@;
428 eval "\$pre_codeconv = ${module}::pre_codeconv();";
429 die $@ if $@;
430 eval "\$post_codeconv = ${module}::post_codeconv();";
431 die $@ if $@;
432 eval "${module}::add_magic(\$Magic);";
433 die $@ if $@;
434
435 for my $mt (@mtypes) {
436 next if (defined $var::Supported{$mt} &&
437 $var::Supported{$mt} eq 'yes' && $status eq 'no');
438 $var::Supported{$mt} = $status;
439 $var::REQUIRE_ACTIONS{$mt} = $module;
440 $var::RECURSIVE_ACTIONS{$mt} = $recursive;
441 $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv;
442 $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv;
443 }
444 }
445}
446
447# Core routine.
448#
449# FIXME: Too many parameters. They must be cleared.
450#
451sub namazu_core ($$$$$$$$$$) {
452 my ($cfile, $docid_count, $docid_base,
453 $file_count, $field_indices, $fh_errorsfile, $total_files_num,
454 $uri, $part_id, $part_num) = @_;
455
456 my $headings = "";
457 my $content = "";
458 my $weighted_str = "";
459 my %fields;
460 my $msg_prefix;
461
462 if ($part_id) {
463 $msg_prefix = " $part_id/$part_num - ";
464 } else {
465 $msg_prefix = $file_count + 1 . "/$total_files_num - ";
466 }
467
468 unless ($uri) {
469 $uri = generate_uri($cfile); # Make a URI from a file name.
470 }
471 my ($cfile_size, $text_size, $kanji, $mtype) =
472 load_document(\$cfile, \$content, \$weighted_str,
473 \$headings, \%fields);
474
475 {
476 $fields{'mtime'} = (stat($cfile))[9];
477 my $utc = $fields{'mtime'};
478 $utc = time::rfc822time_to_mtime($fields{'date'})
479 if (defined $fields{'date'});
480 if ($utc == -1) {
481 my $date = $fields{'date'};
482 print "$cfile Illegal date format. : $date\n";
483 print $fh_errorsfile "$cfile Illegal date format. : $date\n";
484 $utc = $fields{'mtime'};
485 delete $fields{'date'};
486 }
487 $fields{'utc'} = $utc;
488 }
489
490 util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n");
491
492 # Check if the file is acceptable.
493 my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri);
494 if (defined $err) {
495 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
496 my $uri2 = codeconv::eucjp_to_shiftjis($uri);
497 print $msg_prefix . "$uri2 $err\n";
498 } else {
499 print $msg_prefix . "$uri $err\n";
500 }
501 print $fh_errorsfile "$cfile $err\n";
502 return 0; # return 0 if error
503 }
504
505 # Print processing file name as URI.
506 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
507 my $uri2 = codeconv::eucjp_to_shiftjis($uri);
508 print $msg_prefix . "$uri2 [$mtype]\n";
509 } else {
510 print $msg_prefix . "$uri [$mtype]\n";
511 }
512
513 # Add filename.
514 my $filename = defined $cfile ? $cfile : '';
515 codeconv::toeuc(\$filename);
516 $filename = basename($filename);
517 $fields{'filename'} = $filename;
518
519 complete_field_info(\%fields, $cfile, $uri,
520 \$headings, \$content, \$weighted_str);
521 put_field_index(\%fields, $field_indices);
522
523 put_dateindex($cfile);
524 $content .= "\n\n$filename\n\n"; # add filename
525 $content .= $weighted_str; # add weights
526 count_words($docid_count, $docid_base, \$content, $kanji);
527 make_phrase_hash($docid_count, $docid_base, \$content);
528
529 # assertion
530 util::assert($cfile_size != 0,
531 "cfile_size == 0 at the end of namazu_core.");
532
533 return $cfile_size;
534}
535
536#
537# Make the URI from the given file name.
538#
539sub generate_uri (@) {
540 my ($file, $fragment) = @_;
541 return "" unless defined $file;
542
543 # omit a file name if omittable
544 $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o;
545
546 if (defined $ReplaceCode) {
547 # transforming URI by evaling
548 $_ = $file;
549 eval $ReplaceCode;
550 $file = $_;
551 }
552
553 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
554 $file =~ s#^([A-Z]):#/$1|#i; # converting a drive part like: /C|
555 }
556
557 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
558 $file = codeconv::shiftjis_to_eucjp($file);
559 }
560 if (defined $fragment) {
561 codeconv::toeuc(\$fragment);
562 }
563
564 unless ($var::Opt{'noencodeuri'}) {
565 for my $tmp ($file, $fragment) {
566 next unless defined $tmp;
567
568 # Escape unsafe characters (not strict)
569 $tmp =~ s/\%/%25/g; # Convert original '%' into '%25' v1.1.1.2
570 $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/
571 sprintf("%%%02X",ord($1))/ge;
572 }
573 }
574
575
576 my $uri = $file;
577 $uri .= "#" . $fragment if defined $fragment;
578 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
579 # restore '|' for drive letter rule of Win32, OS/2
580 $uri =~ s!^/([A-Z])%7C!/$1|!i;
581 }
582 return $uri;
583}
584
585
586sub get_field_index_base (\%) {
587 my ($field_indices) = @_;
588
589 my @keys = split('\|', $conf::SEARCH_FIELD);
590 if ($var::Opt{'meta'}) {
591 push @keys, (split '\|', $conf::META_TAGS);
592 }
593 for my $key (@keys) {
594 $key = lc($key);
595 my $fname = "$var::NMZ{'field'}.$key";
596 my $tmp_fname = util::tmpnam("NMZ.field.$key");
597 my $size = 0;
598 $size = -s $fname if -f $fname;
599 $size += -s $tmp_fname if -f $tmp_fname;
600 $field_indices->{$key} = $size;
601 }
602}
603
604sub complete_field_info (\%$$\$\$\$) {
605 my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_;
606
607 for my $field (keys %{$fields}) {
608 if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) {
609 delete $fields->{$field};
610 }
611 }
612
613 unless (defined($fields->{'title'})) {
614 $fields->{'title'} = gfilter::filename_to_title($cfile, $wsref);
615 }
616 unless (defined($fields->{'date'})) {
617 my $mtime = $fields->{'mtime'};
618 my $date = util::rfc822time($mtime);
619 $fields->{'date'} = $date;
620 }
621 unless (defined($fields->{'uri'})) {
622 $fields->{'uri'} = $uri;
623 }
624 unless (defined($fields->{'size'})) {
625 $fields->{'size'} = -s $cfile;
626 }
627 unless (defined($fields->{'summary'})) {
628 $fields->{'summary'} = make_summary($contref, $headings, $cfile);
629 }
630 unless (defined($fields->{'from'}) || defined($fields->{'author'})) {
631 $fields->{'from'} = getmsg("unknown");
632 }
633}
634
635#
636# Currently, messages for NMZ.* files should be encoded in
637# EUC-JP currently. We cannot use gettext.pl for the messsage
638# because gettext.pl may use Shift_JIS encoded messages.
639# So, we should use the function instead of gettext().
640#
641# FIXME: Ad hoc impl. getmsg() is effective only for "unknown".
642#
643sub getmsg($) {
644 my ($msg) = @_;
645
646 if (util::islang_msg("ja")) {
647 if ($msg eq "unknown") {
648