fix mbox archive link
[mharc.git] / bin / mknmz
1 #! /usr/bin/perl -w
2 # -*- Perl -*-
3 # mknmz - indexer of Namazu
4 # $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $
5 #
6 # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
7 # Copyright (C) 2000-2008 Namazu Project All rights reserved.
8 # This is free software with ABSOLUTELY NO WARRANTY.
9 #
10 # This program is free software; you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation; either versions 2, or (at your option)
13 # any later version.
14 #
15 # This program is distributed in the hope that it will be useful
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
19 #
20 # You should have received a copy of the GNU General Public License
21 # along with this program; if not, write to the Free Software
22 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 # 02111-1307, USA
24 #
25 # This file must be encoded in EUC-JP encoding
26 #
27
28 package mknmz;
29 require 5.004;
30 use English;
31 use lib ".";
32 use Cwd;
33 use IO::File;
34 use File::Find;
35 use File::MMagic;
36 use Time::Local;
37 use strict; # be strict since v1.2.0
38 use Getopt::Long;
39 use File::Copy;
40 use DirHandle;
41 use File::Basename;
42
43 use vars qw($SYSTEM);
44 # It exists only for back compatibility.
45 $SYSTEM = $English::OSNAME;
46
47 my $NAMAZU_INDEX_VERSION = "2.0";
48
49 my $CodingSystem = "euc";
50 my $PKGDATADIR = $ENV{'pkgdatadir'} || "/usr/share/namazu";
51 my $CONFDIR = "/etc/namazu"; # directory where mknmzrc are in.
52 my $LIBDIR = $PKGDATADIR . "/pl"; # directory where library etc. are in.
53 my $FILTERDIR = $PKGDATADIR . "/filter"; # directory where filters are in.
54 my $TEMPLATEDIR = $PKGDATADIR . "/template"; # directory where templates are in.
55
56 my $DeletedFilesCount = 0;
57 my $UpdatedFilesCount = 0;
58 my $APPENDMODE = 0;
59 my %PhraseHash = ();
60 my %PhraseHashLast = ();
61 my %KeyIndex = ();
62 my %KeyIndexLast = ();
63 my %CheckPoint = ("on" => undef, "continue" => undef);
64 my $ConfigFile = undef;
65 my $MediaType = undef;
66
67 my $ReplaceCode = undef; # perl code for transforming URI
68 my @Seed = ();
69 my @LoadedRcfiles = ();
70 my $Magic = new File::MMagic;
71
72 my $ReceiveTERM = 0;
73
74 STDOUT->autoflush(1);
75 STDERR->autoflush(1);
76 main();
77 sub main {
78 my $start_time = time;
79
80 if ($English::PERL_VERSION == 5.008001) {
81 unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) {
82 print "Run mknmz with the environment variable PERL_HASH_SEED=0\n";
83 exit 1;
84 }
85 }
86
87 init();
88
89 # At first, loading pl/conf.pl to prevent overriding some variables.
90 preload_modules();
91
92 # set LANG and bind textdomain
93 util::set_lang();
94 textdomain('namazu', $util::LANG_MSG);
95
96 load_modules();
97 my ($output_dir, @targets) = parse_options();
98 my ($docid_base, $total_files_num) = prep($output_dir, @targets);
99
100 my $swap = 1;
101 my $docid_count = 0;
102 my $file_count = 0;
103 my $total_files_size = 0;
104 my $key_count = 0;
105 my $checkpoint = 0;
106 my $flist_ptr = 0;
107 my $processed_files_size = 0;
108
109 if ($CheckPoint{'continue'}) {
110 # Restore variables
111 eval util::readfile($var::NMZ{'_checkpoint'}) ;
112 } else {
113 print $total_files_num . _(" files are found to be indexed.\n");
114 }
115
116 {
117 my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
118 my $fh_flist = util::efopen($var::NMZ{'_flist'});
119 my %field_indices = ();
120 get_field_index_base(\%field_indices);
121
122 if ($CheckPoint{'continue'}) {
123 seek($fh_flist, $flist_ptr, 0);
124 }
125
126 # Process target files one by one
127 while (defined(my $line = <$fh_flist>)) {
128 $flist_ptr += length($line);
129 my $cfile = $line;
130 chomp $cfile;
131 util::dprint(_("target file: ")."$cfile\n");
132
133 my ($cfile_size, $num) =
134 process_file($cfile, $docid_count, $docid_base,
135 $file_count, \%field_indices,
136 $fh_errorsfile, $total_files_num);
137 if ($num == 0) {
138 $total_files_num--;
139 next;
140 } else {
141 $docid_count += $num;
142 $file_count++;
143 }
144
145 $total_files_size += $cfile_size;
146 $processed_files_size += $cfile_size;
147 last if $ReceiveTERM;
148 if ($processed_files_size > $conf::ON_MEMORY_MAX) {
149 if (%KeyIndex) {
150 $key_count = write_index();
151 print _("Writing index files...");
152 write_phrase_hash();
153 print "\n";
154 }
155 $processed_files_size = 0;
156 $checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
157 }
158 }
159
160 util::fclose($fh_flist);
161 util::fclose($fh_errorsfile);
162 }
163 # This should be out of above blocks because of file handler closing.
164 re_exec($flist_ptr, $docid_count, $docid_base, $start_time,
165 $total_files_size, $total_files_num,
166 $file_count, $key_count) if $checkpoint;
167
168 if (%KeyIndex) {
169 $key_count = write_index();
170 print _("Writing index files...");
171 write_phrase_hash();
172 print "\n";
173 }
174
175 $key_count = get_total_keys() unless $key_count;
176 do_remain_job($total_files_size, $docid_count, $key_count,
177 $start_time);
178 exit 0;
179 }
180
181 #
182 # FIXME: Very complicated.
183 #
184 sub process_file ($$$$\%$$) {
185 my ($cfile, $docid_count, $docid_base, $file_count,
186 $field_indices, $fh_errorsfile, $total_files_num) = @_;
187
188 my $processed_num = 0;
189 my $file_size = util::filesize($cfile);
190
191 if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
192 my @parts;
193 @parts = htmlsplit::split($cfile, "NMZ.partial")
194 if ($file_size <= $conf::FILE_SIZE_MAX);
195 if (@parts > 1) {
196 my $id = 0;
197 for my $part (@parts) {
198 next if (defined $conf::EXCLUDE_PATH &&
199 "$cfile#$part" =~ /$conf::EXCLUDE_PATH/);
200 my $fname = util::tmpnam("NMZ.partial.$id");
201 my $fragment = defined $part ? $part : undef;
202 my $uri = generate_uri($cfile, $fragment);
203 my $result = namazu_core($fname,
204 $docid_count + $processed_num,
205 $docid_base, $file_count,
206 $field_indices, $fh_errorsfile,
207 $total_files_num,
208 $uri, $id, $#parts);
209 if ($result > 0) {
210 $processed_num++;
211 my $rname = defined $part ? "$cfile\t$part" : "$cfile";
212 put_registry($rname);
213 }
214 unlink $fname;
215 $id++;
216 }
217 return ($file_size, $processed_num);
218 }
219 }
220 my $result = namazu_core($cfile, $docid_count, $docid_base,
221 $file_count, $field_indices,
222 $fh_errorsfile, $total_files_num,
223 undef, undef, undef);
224 if ($result > 0) {
225 $processed_num++;
226 put_registry($cfile);
227 }
228
229 return ($file_size, $processed_num);
230 }
231
232 #
233 # Load mknmzrcs:
234 #
235 # 1. MKNMZRC environment
236 #
237 # 2. $(sysconfdir)/$(PACKAGE)/mknmzrc
238 #
239 # 3. ~/.mknmzrc
240 #
241 # 4. user-specified mknmzrc set by mknmz --config=file option.
242 #
243 # If multiple files exists, read all of them.
244 #
245 sub load_rcfiles () {
246 my (@cand) = ();
247
248 # To support Windows. Since they have nasty drive letter convention,
249 # it is necessary to change mknmzrc dynamically with env. variable.
250 push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'};
251 push @cand, "$CONFDIR/mknmzrc";
252 push @cand, "$ENV{'HOME'}/.mknmzrc";
253
254 util::vprint(_("Reading rcfile: "));
255 for my $rcfile (@cand) {
256 if (-f $rcfile) {
257 load_rcfile ($rcfile);
258 util::vprint(" $rcfile");
259 }
260 }
261 util::vprint("\n");
262 }
263
264 sub load_rcfile ($) {
265 my ($rcfile) = @_;
266 if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
267 util::win32_yen_to_slash(\$rcfile);
268 }
269 return if (grep {m/^$rcfile$/} @LoadedRcfiles);
270 do $rcfile;
271 if ($@) {
272 chop $@;
273 push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'";
274 }else {
275 push @LoadedRcfiles, $rcfile;
276 }
277
278 # Dirty workaround.
279 $LIBDIR = $conf::LIBDIR
280 if (defined $conf::LIBDIR && -d $conf::LIBDIR);
281 $FILTERDIR = $conf::FILTERDIR
282 if (defined $conf::FILTERDIR && -d $conf::FILTERDIR);
283 $TEMPLATEDIR = $conf::TEMPLATEDIR
284 if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR);
285 }
286
287 sub re_exec($$$$$$$$) {
288 my ($flist_ptr, $docid_count, $docid_base, $start_time,
289 $total_files_size, $total_files_num, $file_count, $key_count) = @_;
290
291 # store variables
292 {
293 my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}");
294
295 print $fh_checkpoint <<EOM;
296 \$DeletedFilesCount = $DeletedFilesCount;
297 \$UpdatedFilesCount = $UpdatedFilesCount;
298 \$APPENDMODE = $APPENDMODE;
299 \$flist_ptr = $flist_ptr;
300 \$docid_count = $docid_count;
301 \$docid_base = $docid_base;
302 \$start_time = $start_time;
303 \$total_files_size = $total_files_size;
304 \$total_files_num = $total_files_num;
305 \$key_count = $key_count;
306 \$file_count = $file_count;
307 \$\$ = $$;
308 EOM
309 util::fclose($fh_checkpoint);
310 }
311
312 @ARGV = ("-S", @ARGV) ;
313 print _("Checkpoint reached: re-exec mknmz...\n");
314 util::dprint(join ' ', ("::::", @ARGV, "\n"));
315 exec ($0, @ARGV) ;
316 }
317
318 sub put_registry ($) {
319 my ($filename) = @_;
320 my $fh_registry = util::efopen(">>$var::NMZ{'_r'}");
321 print $fh_registry $filename, "\n";
322 util::fclose($fh_registry);
323 }
324
325
326 # Initialization
327 # $CodingSystem: Character Coding System 'euc' or 'sjis'
328 sub init () {
329 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
330 $CodingSystem = "sjis";
331 if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
332 $CONFDIR = $1 . $CONFDIR ;
333 }
334 if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
335 $LIBDIR = $1 . $LIBDIR ;
336 }
337 if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
338 $FILTERDIR = $1 . $FILTERDIR ;
339 }
340 if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
341 $TEMPLATEDIR = $1 . $TEMPLATEDIR ;
342 }
343 } else {
344 $CodingSystem = "euc";
345 }
346
347 $SIG{'INT'} = sub {
348 util::cdie("SIGINT caught! Aborted.\n");
349 };
350
351 $SIG{'TERM'} = sub {
352 print STDERR "SIGTERM caught!\n";
353 $ReceiveTERM = 1;
354 };
355 }
356
357 sub preload_modules () {
358 unshift @INC, $LIBDIR;
359 # workaround for test suites.
360 unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'};
361
362 require "var.pl" || die "unable to require \"var.pl\"\n";
363 require "conf.pl" || die "unable to require \"conf.pl\"\n";
364 require "util.pl" || die "unable to require \"util.pl\"\n";
365 require "gettext.pl" || die "unable to require \"gettext.pl\"\n";
366 require "ext.pl" || die "unable to require \"ext.pl\"\n";
367 }
368
369 sub postload_modules () {
370 require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n";
371 }
372
373 sub load_modules () {
374 require "usage.pl" || die "unable to require \"usage.pl\"\n";
375 require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n";
376 require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
377 require "seed.pl" || die "unable to require \"seed.pl\"\n";
378 require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";
379
380 @Seed = seed::init();
381 }
382
383 sub load_filtermodules () {
384 unshift @INC, $FILTERDIR;
385
386 #
387 # Windows modules must be loaded first.
388 # Because OLE filters have low precedence over normal ones.
389 #
390 load_win32modules() if $English::OSNAME eq "MSWin32";
391
392 # Check filter modules
393 my @filters = ();
394 @filters = glob "$FILTERDIR/*.pl";
395
396 load_filters(@filters);
397 }
398
399 sub load_win32modules () {
400 # Check filter modules
401 my @filters = ();
402 if (-f "../filter/win32/olemsword.pl") { # to ease developing
403 @filters = glob "../filter/win32/*.pl";
404 unshift @INC, "../filter/win32";
405 } else {
406 @filters = glob "$FILTERDIR/win32/*.pl";
407 unshift @INC, "$FILTERDIR/win32";
408 }
409
410 load_filters(@filters);
411 }
412
413 sub load_filters (@) {
414 my @filters = @_;
415
416 for my $filter (@filters) {
417 $filter =~ m!([-\w]+)\.pl$!;
418 my $module = $1;
419 require "$module.pl" || die "unable to require \"$module.pl\"\n";;
420 my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv);
421
422 eval "\@mtypes = ${module}::mediatype();";
423 die $@ if $@; # eval error
424 eval "\$status = ${module}::status();";
425 die $@ if $@;
426 eval "\$recursive = ${module}::recursive();";
427 die $@ if $@;
428 eval "\$pre_codeconv = ${module}::pre_codeconv();";
429 die $@ if $@;
430 eval "\$post_codeconv = ${module}::post_codeconv();";
431 die $@ if $@;
432 eval "${module}::add_magic(\$Magic);";
433 die $@ if $@;
434
435 for my $mt (@mtypes) {
436 next if (defined $var::Supported{$mt} &&
437 $var::Supported{$mt} eq 'yes' && $status eq 'no');
438 $var::Supported{$mt} = $status;
439 $var::REQUIRE_ACTIONS{$mt} = $module;
440 $var::RECURSIVE_ACTIONS{$mt} = $recursive;
441 $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv;
442 $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv;
443 }
444 }
445 }
446
447 # Core routine.
448 #
449 # FIXME: Too many parameters. They must be cleared.
450 #
451 sub namazu_core ($$$$$$$$$$) {
452 my ($cfile, $docid_count, $docid_base,
453 $file_count, $field_indices, $fh_errorsfile, $total_files_num,
454 $uri, $part_id, $part_num) = @_;
455
456 my $headings = "";
457 my $content = "";
458 my $weighted_str = "";
459 my %fields;
460 my $msg_prefix;
461
462 if ($part_id) {
463 $msg_prefix = " $part_id/$part_num - ";
464 } else {
465 $msg_prefix = $file_count + 1 . "/$total_files_num - ";
466 }
467
468 unless ($uri) {
469 $uri = generate_uri($cfile); # Make a URI from a file name.
470 }
471 my ($cfile_size, $text_size, $kanji, $mtype) =
472 load_document(\$cfile, \$content, \$weighted_str,
473 \$headings, \%fields);
474
475 {
476 $fields{'mtime'} = (stat($cfile))[9];
477 my $utc = $fields{'mtime'};
478 $utc = time::rfc822time_to_mtime($fields{'date'})
479 if (defined $fields{'date'});
480 if ($utc == -1) {
481 my $date = $fields{'date'};
482 print "$cfile Illegal date format. : $date\n";
483 print $fh_errorsfile "$cfile Illegal date format. : $date\n";
484 $utc = $fields{'mtime'};
485 delete $fields{'date'};
486 }
487 $fields{'utc'} = $utc;
488 }
489
490 util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n");
491
492 # Check if the file is acceptable.
493 my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri);
494 if (defined $err) {
495 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
496 my $uri2 = codeconv::eucjp_to_shiftjis($uri);
497 print $msg_prefix . "$uri2 $err\n";
498 } else {
499 print $msg_prefix . "$uri $err\n";
500 }
501 print $fh_errorsfile "$cfile $err\n";
502 return 0; # return 0 if error
503 }
504
505 # Print processing file name as URI.
506 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
507 my $uri2 = codeconv::eucjp_to_shiftjis($uri);
508 print $msg_prefix . "$uri2 [$mtype]\n";
509 } else {
510 print $msg_prefix . "$uri [$mtype]\n";
511 }
512
513 # Add filename.
514 my $filename = defined $cfile ? $cfile : '';
515 codeconv::toeuc(\$filename);
516 $filename = basename($filename);
517 $fields{'filename'} = $filename;
518
519 complete_field_info(\%fields, $cfile, $uri,
520 \$headings, \$content, \$weighted_str);
521 put_field_index(\%fields, $field_indices);
522
523 put_dateindex($cfile);
524 $content .= "\n\n$filename\n\n"; # add filename
525 $content .= $weighted_str; # add weights
526 count_words($docid_count, $docid_base, \$content, $kanji);
527 make_phrase_hash($docid_count, $docid_base, \$content);
528
529 # assertion
530 util::assert($cfile_size != 0,
531 "cfile_size == 0 at the end of namazu_core.");
532
533 return $cfile_size;
534 }
535
536 #
537 # Make the URI from the given file name.
538 #
539 sub generate_uri (@) {
540 my ($file, $fragment) = @_;
541 return "" unless defined $file;
542
543 # omit a file name if omittable
544 $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o;
545
546 if (defined $ReplaceCode) {
547 # transforming URI by evaling
548 $_ = $file;
549 eval $ReplaceCode;
550 $file = $_;
551 }
552
553 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
554 $file =~ s#^([A-Z]):#/$1|#i; # converting a drive part like: /C|
555 }
556
557 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
558 $file = codeconv::shiftjis_to_eucjp($file);
559 }
560 if (defined $fragment) {
561 codeconv::toeuc(\$fragment);
562 }
563
564 unless ($var::Opt{'noencodeuri'}) {
565 for my $tmp ($file, $fragment) {
566 next unless defined $tmp;
567
568 # Escape unsafe characters (not strict)
569 $tmp =~ s/\%/%25/g; # Convert original '%' into '%25' v1.1.1.2
570 $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/
571 sprintf("%%%02X",ord($1))/ge;
572 }
573 }
574
575
576 my $uri = $file;
577 $uri .= "#" . $fragment if defined $fragment;
578 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
579 # restore '|' for drive letter rule of Win32, OS/2
580 $uri =~ s!^/([A-Z])%7C!/$1|!i;
581 }
582 return $uri;
583 }
584
585
586 sub get_field_index_base (\%) {
587 my ($field_indices) = @_;
588
589 my @keys = split('\|', $conf::SEARCH_FIELD);
590 if ($var::Opt{'meta'}) {
591 push @keys, (split '\|', $conf::META_TAGS);
592 }
593 for my $key (@keys) {
594 $key = lc($key);
595 my $fname = "$var::NMZ{'field'}.$key";
596 my $tmp_fname = util::tmpnam("NMZ.field.$key");
597 my $size = 0;
598 $size = -s $fname if -f $fname;
599 $size += -s $tmp_fname if -f $tmp_fname;
600 $field_indices->{$key} = $size;
601 }
602 }
603
604 sub complete_field_info (\%$$\$\$\$) {
605 my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_;
606
607 for my $field (keys %{$fields}) {
608 if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) {
609 delete $fields->{$field};
610 }
611 }
612
613 unless (defined($fields->{'title'})) {
614 $fields->{'title'} = gfilter::filename_to_title($cfile, $wsref);
615 }
616 unless (defined($fields->{'date'})) {
617 my $mtime = $fields->{'mtime'};
618 my $date = util::rfc822time($mtime);
619 $fields->{'date'} = $date;
620 }
621 unless (defined($fields->{'uri'})) {
622 $fields->{'uri'} = $uri;
623 }
624 unless (defined($fields->{'size'})) {
625 $fields->{'size'} = -s $cfile;
626 }
627 unless (defined($fields->{'summary'})) {
628 $fields->{'summary'} = make_summary($contref, $headings, $cfile);
629 }
630 unless (defined($fields->{'from'}) || defined($fields->{'author'})) {
631 $fields->{'from'} = getmsg("unknown");
632 }
633 }
634
635 #
636 # Currently, messages for NMZ.* files should be encoded in
637 # EUC-JP currently. We cannot use gettext.pl for the messsage
638 # because gettext.pl may use Shift_JIS encoded messages.
639 # So, we should use the function instead of gettext().
640 #
641 # FIXME: Ad hoc impl. getmsg() is effective only for "unknown".
642 #
643 sub getmsg($) {
644 my ($msg) = @_;
645
646 if (util::islang_msg("ja")) {
647 if ($msg eq "unknown") {
648 return "ÉÔÌÀ";
649 }
650 }
651 return $msg;
652 }
653
654 sub make_summary ($$$) {
655 my ($contref, $headings, $cfile) = @_;
656
657 # pick up $conf::MAX_FIELD_LENGTH bytes string
658 my $tmp = "";
659 if ($$headings ne "") {
660 $$headings =~ s/^\s+//;
661 $$headings =~ s/\s+/ /g;
662 $tmp = $$headings;
663 } else {
664 $tmp = "";
665 }
666
667 my $offset = 0;
668 my $tmplen = 0;
669 while (($tmplen = $conf::MAX_FIELD_LENGTH + 1 - length($tmp)) > 0
670 && $offset < length($$contref))
671 {
672 $tmp .= substr $$contref, $offset, $tmplen;
673 $offset += $tmplen;
674 $tmp =~ s/(([\xa1-\xfe]).)/$2 eq "\xa8" ? '': $1/ge;
675 $tmp =~ s/([-=*\#])\1{2,}/$1$1/g;
676 }
677
678 # -1 means "LF"
679 my $summary = substr $tmp, 0, $conf::MAX_FIELD_LENGTH - 1;
680 # Remove a garbage Kanji 1st char at the end.
681 $summary = codeconv::chomp_eucjp($summary);
682
683 $summary =~ s/^\s+//;
684 $summary =~ s/\s+/ /g; # normalize white spaces
685
686 return $summary;
687 }
688
689
690 # output the field infomation into NMZ.fields.* files
691 sub put_field_index (\%$) {
692 my ($fields, $field_indices) = @_;
693
694 my $aliases_regex =
695 join('|', sort {length($b) <=> length($a)} keys %conf::FIELD_ALIASES);
696
697 for my $field (keys %{$fields}) {
698 util::dprint("Field: $field: $fields->{$field}\n");
699 if ($field =~ /^($aliases_regex)$/o) {
700 unless (defined($fields->{$conf::FIELD_ALIASES{$field}})) {
701 $fields->{$conf::FIELD_ALIASES{$field}} = $fields->{$field};
702 }
703 undef $fields->{$field};
704 }
705 }
706
707 my @keys = split '\|', $conf::SEARCH_FIELD;
708 if ($var::Opt{'meta'}) {
709 my @meta = split '\|', $conf::META_TAGS;
710 while (my $meta = shift(@meta)) {
711 $meta = $conf::FIELD_ALIASES{$meta}
712 if (defined $conf::FIELD_ALIASES{$meta});
713
714 push @keys, $meta;
715 }
716
717 # uniq @keys
718 my %mark = ();
719 @keys = grep {$mark{$_}++; $mark{$_} == 1} @keys;
720 }
721 for my $key (@keys) {
722 my $lkey = lc($key);
723 my $fname = util::tmpnam("NMZ.field.$lkey");
724 my $fh_field = util::efopen(">>$fname");
725 my $output = "";
726 if (defined($fields->{$key})) {
727 if ($key ne 'uri') { # workaround for namazu-bugs-ja#30
728 $fields->{$key} =~ s/\s+/ /g;
729 $fields->{$key} =~ s/\s+$//;
730 $fields->{$key} =~ s/^\s+//;
731 }
732 $output = $fields->{$key};
733
734 # -1 means "LF"
735 $output = substr $output, 0, $conf::MAX_FIELD_LENGTH - 1;
736 # Remove a garbage Kanji 1st char at the end.
737 $output = codeconv::chomp_eucjp($output);
738
739 $output =~ s/\n.*$//s;
740 $output .= "\n";
741 } else {
742 $output = "\n";
743 }
744 print $fh_field $output;
745 util::fclose($fh_field);
746
747 # put index of field index
748 {
749 my $fname = util::tmpnam("NMZ.field.$lkey.i");
750 my $fh_field_idx = util::efopen(">>$fname");
751 print $fh_field_idx pack("N", $field_indices->{$lkey});
752 $field_indices->{$lkey} += length $output;
753 util::fclose($fh_field_idx);
754 }
755 }
756
757 }
758
759 # put the date infomation into NMZ.t file
760 sub put_dateindex ($) {
761 my ($cfile) = @_;
762 my $mtime = (stat($cfile))[9];
763
764 my $fh_dataindex = util::efopen(">>$var::NMZ{'_t'}");
765 print $fh_dataindex pack("N", $mtime);
766 util::fclose($fh_dataindex);
767 }
768
769
770 # load a document file
771 sub load_document ($$$$\%) {
772 my ($orig_cfile, $contref, $weighted_str, $headings, $fields)
773 = @_;
774 my $cfile = $$orig_cfile;
775
776 return (0, 0, 0, 0) unless (-f $cfile && util::canopen($cfile));
777
778 # for handling a filename which contains Shift_JIS code for Windows.
779 # for handling a filename which contains including space.
780 my $shelter_cfile = "";
781 if (($cfile =~ /\s/) ||
782 ($English::OSNAME eq "MSWin32"
783 && $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/) )
784 {
785 $shelter_cfile = $cfile;
786 $cfile = util::tmpnam("NMZ.win32");
787 unlink $cfile if (-e $cfile);
788 copy($shelter_cfile, $cfile);
789 }
790
791 my $file_size;
792 $file_size = util::filesize($cfile); # not only file in feature.
793 if ($file_size > $conf::FILE_SIZE_MAX) {
794 return ($file_size, $file_size, 0, 'x-system/x-error; x-error=file_size_max');
795 }
796
797 $$contref = util::readfile($cfile);
798 # $file_size = length($$contref);
799
800 my ($kanji, $mtype) = apply_filter($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, undef);
801
802 if ($English::OSNAME eq "MSWin32" && $shelter_cfile ne "") {
803 unlink $cfile;
804 $cfile = $shelter_cfile;
805 }
806
807 # Measure the text size at this time.
808 my $text_size = length($$contref) + length($$weighted_str);
809
810 return ($file_size, $text_size, $kanji, $mtype);
811 }
812
813 sub apply_filter($$$$$$$) {
814 my ($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, $mmtype)
815 = @_;
816 my $cfile = $shelter_cfile ne "" ? $shelter_cfile : $$orig_cfile;
817
818 # Filtering process.
819 my $mtype;
820 my $called_dt = 0;
821 while (1) {
822 if (defined $MediaType) {
823 $mtype = $MediaType;
824 } elsif (defined $mmtype) {
825 $mtype = $mmtype;
826 } else {
827 my $mtype_n = $Magic->checktype_byfilename($cfile);
828 my $mtype_c = $Magic->checktype_data($$contref);
829 my $mtype_m;
830 $mtype_m = $Magic->checktype_magic($$contref)
831 if ((! defined $mtype_c) ||
832 $mtype_c =~
833 /^(text\/html|text\/plain|application\/octet-stream)$/);
834 $mtype_c = $mtype_m
835 if (defined $mtype_m &&
836 $mtype_m !~
837 /^(text\/html|text\/plain|application\/octet-stream)$/);
838 $mtype_c = 'text/plain' unless defined $mtype_c;
839 if ($called_dt) {
840 $mtype = $mtype_c;
841 } else {
842 $mtype = decide_type($mtype_n, $mtype_c);
843 $called_dt = 1;
844 }
845 }
846 util::dprint(_("Detected type: ")."$mtype\n");
847
848 # Pre code conversion.
849 if ($var::REQUIRE_PRE_CODECONV{$mtype}) {
850 util::dprint("pre_codeconv\n");
851 codeconv_document($contref);
852 }
853
854 if (! $var::Supported{$mtype} ||
855 $var::Supported{$mtype} ne 'yes')
856 {
857 util::vprint(_("Unsupported media type ")."$mtype\n");
858 return (0, "$mtype; x-system=unsupported");
859 }
860
861 if ($var::REQUIRE_ACTIONS{$mtype}) {
862 util::vprint(_("Using ")."$var::REQUIRE_ACTIONS{$mtype}.pl\n");
863 require $var::REQUIRE_ACTIONS{$mtype}.'.pl'
864 || die _("unable to require ") .
865 "\"$var::REQUIRE_ACTIONS{$mtype}.pl\"\n";
866 my $err = undef;
867 {
868 local $SIG{'PIPE'} = \&trapintr;
869 eval '$err = ' . $var::REQUIRE_ACTIONS{$mtype} .
870 '::filter($orig_cfile, $contref, $weighted_str, $headings, $fields);';
871 }
872 if ($err) {
873 if ($err =~ m/; x-system=unsupported$/) {
874 return (0, $err);
875 }
876 return (0, "$mtype; x-error=$err");
877 }
878
879 if ($@) {
880 util::vprint(_("Failed to call ")."$var::REQUIRE_ACTIONS{$mtype}\n$@\n");
881 return (0, "$mtype; x-error=$@");
882 }
883
884 # Post code conversion.
885 if ($var::REQUIRE_POST_CODECONV{$mtype}) {
886 util::dprint("post_codeconv\n");
887 codeconv_document($contref);
888 }
889
890 next if ($var::RECURSIVE_ACTIONS{$mtype});
891 }
892 last;
893 }
894
895 my $kanji = $$contref =~ tr/\xa1-\xfe/\xa1-\xfe/; # Kanji contained?
896 $kanji += $$weighted_str =~ tr/\xa1-\xfe/\xa1-\xfe/;
897
898 return ($kanji, $mtype);
899 }
900
901 sub codeconv_document ($) {
902 my ($textref) = @_;
903 codeconv::toeuc($textref);
904 $$textref =~ s/\r\n/\n/g;
905 $$textref =~ s/\r/\n/g;
906 $$textref =~ tr/\x01-\x08\x0b-\x0c\x0e-\x1f\x7f/ /; # Remove control char.
907 }
908
909 sub prep () {
910 my $docid_base = 0;
911 my $output_dir = shift @_ ;
912 my @targets = @_ ;
913 my @flist = ();
914
915 $var::OUTPUT_DIR = $output_dir;
916
917 require_modules();
918 change_filenames();
919 check_present_index();
920
921 # if Checkpoint mode, return
922 return (0, 0) if $CheckPoint{'continue'};
923
924 check_lockfile($var::NMZ{'lock2'});
925 print _("Looking for indexing files...\n");
926 @flist = find_target(@targets);
927 ($docid_base, @flist) = append_index(@flist)
928 if -f $var::NMZ{'r'};
929 unless (@flist) { # if @flist is empty
930 print _("No files to index.\n");
931 exit 0;
932 }
933 set_lockfile($var::NMZ{'lock2'});
934 save_flist(@flist);
935 my $total_files_num = @flist;
936
937 return ($docid_base, $total_files_num);
938 }
939
940 sub save_flist(@) {
941 my @flist = @_;
942 return if (@flist == 0);
943
944 my $fh_flist = util::efopen(">$var::NMZ{'_flist'}");
945 print $fh_flist join("\n", @flist), "\n";
946 util::fclose($fh_flist);
947 }
948
949 sub require_modules() {
950 if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) {
951 require NKF || die "unable to require \"NKF\"\n";
952 util::dprint(_("code conversion: using NKF module\n"));
953 $var::USE_NKF_MODULE = 1;
954 }
955 if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) {
956 require Text::Kakasi || die "unable to require \"Text::Kakasi\"\n";
957 util::dprint(_("wakati: using Text::Kakasi module\n"));
958 my $res = Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-oeuc', '-w');
959 }
960 if (util::islang("ja") && $conf::WAKATI =~ /^module_chasen/) {
961 require Text::ChaSen || die "unable to require \"Text::ChaSen\"\n";
962 util::dprint(_("wakati: using Text::ChaSen module\n"));
963 my @arg = ('-i', 'e', '-j', '-F', '%m ');
964 @arg = ('-i', 'e', '-j', '-F', '%m %H\\n') if $var::Opt{'noun'};
965 my $res = Text::ChaSen::getopt_argv('chasen-perl', @arg);
966 }
967 if (util::islang("ja") && $conf::WAKATI =~ /^module_mecab/) {
968 require MeCab || die "unable to require \"MeCab\"\n";
969 util::dprint(_("wakati: using MeCab module\n"));
970 }
971 }
972
973 sub check_lockfile ($) {
974 # warn if check file exists in case other process is running or abnormal
975 # stop execution (later is not the major purpose, though).
976 # This is mainly for early detection before longish find_target.
977 my ($file) = @_;
978
979 if (-f $file) {
980 print "$file "._("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n");
981 exit 1;
982 }
983 }
984
985 sub set_lockfile ($) {
986 my ($file) = @_;
987
988 # make a lock file
989 if (-f $file) {
990 print "$file found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n";
991 exit 1;
992 } else {
993 my $fh_lockfile = util::efopen(">$file");
994 print $fh_lockfile "$$"; # save pid
995 util::fclose($fh_lockfile);
996 }
997 }
998
999 sub remove_lockfile ($) {
1000 my ($file) = @_;
1001
1002 # remove lock file
1003 unlink $file if -f $file;
1004 }
1005
1006 # check present index whether it is old type of not
1007 sub check_present_index () {
1008 if (-f $var::NMZ{'i'} && ! -f "$var::NMZ{'wi'}")
1009 {
1010 util::cdie(_("Present index is old type. it's unsupported.\n"));
1011 }
1012 }
1013
1014 # remain
1015 sub do_remain_job ($$$$) {
1016 my ($total_files_size, $docid_count, $key_count, $start_time) = @_;
1017
1018 if ($docid_count == 0) {
1019 # No files are indexed
1020 if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
1021 update_dateindex();
1022 update_registry($docid_count);
1023 }
1024 } else {
1025 set_lockfile($var::NMZ{'lock'});
1026 write_version();
1027 write_body_msg();
1028 write_tips_msg();
1029 write_result_file();
1030 update_field_index();
1031 update_dateindex();
1032 update_registry($docid_count);
1033 write_nmz_files();
1034 make_slog_file();
1035 remove_lockfile($var::NMZ{'lock'});
1036 }
1037 make_headfoot_pages($docid_count, $key_count);
1038 put_log($total_files_size, $start_time, $docid_count, $key_count);
1039 util::remove_tmpfiles();
1040 unlink $var::NMZ{'_flist'};
1041 }
1042
1043 sub make_headfoot_pages($$) {
1044 my ($docid_count, $key_count) = @_;
1045
1046 for my $file (glob "$TEMPLATEDIR/NMZ.head*") {
1047 if ($file =~ m!^.*/NMZ\.head(\.[-\w\.]+)?$!){
1048 my $suffix = $1 ? $1 : '';
1049 make_headfoot("$var::NMZ{'head'}${suffix}", $docid_count, $key_count);
1050 }
1051 }
1052 for my $file (glob "$TEMPLATEDIR/NMZ.foot*") {
1053 if ($file =~ m!^.*/NMZ\.foot(\.[-\w\.]+)?$!){
1054 my $suffix = $1 ? $1 : '';
1055 make_headfoot("$var::NMZ{'foot'}${suffix}", $docid_count, $key_count);
1056 }
1057 }
1058 }
1059
1060 # Parse command line options.
1061 sub parse_options
1062 {
1063 if (@ARGV == 0) {
1064 show_mini_usage();
1065 exit 1;
1066 }
1067
1068 my @targets = ();
1069 my $targets_loaded = 0;
1070 my @argv = @ARGV;
1071 my $cwd = cwd();
1072
1073 my $opt_dummy = 0;
1074 my $opt_version = 0;
1075 my $opt_help = 0;
1076 my $opt_all = 0;
1077 my $opt_chasen = 0;
1078 my $opt_chasen_noun = 0;
1079 my $opt_kakasi = 0;
1080 my $opt_mecab = 0;
1081 my $opt_checkpoint_sub = 0;
1082 my $opt_show_config = 0;
1083 my $opt_mailnews = 0;
1084 my $opt_mhonarc = 0;
1085 my $opt_norc = 0;
1086
1087 my $opt_quiet = undef;
1088 my $opt_config = undef;
1089 my $output_dir = undef;
1090 my $update_index = undef;
1091 my $include_file = undef;
1092 my $target_list = undef;
1093 my $index_lang = undef;
1094
1095 my %opt_conf;
1096
1097 # Getopt::Long::Configure('bundling');
1098 Getopt::Long::config('bundling');
1099 GetOptions(
1100 '0|help' => \$opt_help,
1101 '1|exclude=s' => \$opt_conf{'EXCLUDE_PATH'},
1102 '2|deny=s' => \$opt_conf{'DENY_FILE'},
1103 '3|allow=s' => \$opt_conf{'ALLOW_FILE'},
1104 '4|update=s' => \$update_index,
1105 '5|mhonarc' => \$opt_mhonarc,
1106 '6|mtime=s' => \$var::Opt{'mtime'},
1107 '7|html-split' => \$var::Opt{'htmlsplit'},
1108 'C|show-config' => \$opt_show_config,
1109 'E|no-edge-symbol' => \$var::Opt{'noedgesymbol'},
1110 'F|target-list=s' => \$target_list,
1111 'G|no-okurigana' => \$var::Opt{'okurigana'},
1112 'H|no-hiragana' => \$var::Opt{'hiragana'},
1113 'I|include=s' => \$include_file,
1114 'K|no-symbol' => \$var::Opt{'nosymbol'},
1115 'L|indexing-lang=s' => \$index_lang,
1116 'M|meta' => \$var::Opt{'meta'},
1117 'O|output-dir=s' => \$output_dir,
1118 'S|checkpoint-sub' => \$opt_checkpoint_sub,
1119 'T|template-dir=s' => \$TEMPLATEDIR,
1120 'U|no-encode-uri' => \$var::Opt{'noencodeuri'} ,
1121 'V|verbose' => \$var::Opt{'verbose'},
1122 'Y|no-delete' => \$var::Opt{'nodelete'},
1123 'Z|no-update' => \$var::Opt{'noupdate'},
1124 'a|all' => \$opt_all,
1125 'b|use-mecab' => \$opt_mecab,
1126 'c|use-chasen' => \$opt_chasen,
1127 'd|debug' => \$var::Opt{'debug'},
1128 'e|robots' => \$var::Opt{'robotexclude'},
1129 'f|config=s' => \$opt_config,
1130 'h|mailnews' => \$opt_mailnews,
1131 'k|use-kakasi' => \$opt_kakasi,
1132 'm|use-chasen-noun' => \$opt_chasen_noun,
1133 'q|quiet' => \$opt_quiet,
1134 'r|replace=s' => \$ReplaceCode,
1135 's|checkpoint' => \$CheckPoint{'on'},
1136 't|media-type=s' => \$MediaType,
1137 'u|uuencode' => \$opt_dummy, # for backward compat.
1138 'v|version' => \$opt_version,
1139 'x|no-heading-summary'=> \$var::Opt{'noheadabst'},
1140 'z|check-filesize' => \$var::Opt{'checkfilesize'},
1141 'decode-base64' => \$var::Opt{'decodebase64'},
1142 'norc' => \$opt_norc,
1143 );
1144
1145 if ($opt_quiet) {
1146 # Make STDOUT quiet by redirecting STDOUT to null device.
1147 my $devnull = util::devnull();
1148 open(STDOUT, ">$devnull") || die "$devnull: $!";
1149 }
1150
1151 if (defined $update_index) {
1152 unless (-d $update_index) {
1153 print _("No such index: "), "$update_index\n";
1154 exit 1;
1155 }
1156
1157 my $orig_status = $var::NMZ{'status'};
1158 $var::NMZ{'status'} = "$update_index/$var::NMZ{'status'}";
1159
1160 my $argv = get_status("argv");
1161 if (!defined $argv) {
1162 print _("No such index: "), "$update_index\n";
1163 exit 1;
1164 }
1165 @ARGV = split /\t/, $argv;
1166 util::dprint(_("Inherited argv: ")."@ARGV\n");
1167
1168 my $cwd = get_status("cwd");
1169 if (!defined $cwd) {
1170 print _("No such index: "), "$update_index\n";
1171 exit 1;
1172 }
1173 chdir $cwd;
1174 util::dprint(_("Inherited cwd: ")."$cwd\n");
1175
1176 ($output_dir, @targets) = parse_options();
1177 $output_dir = $update_index;
1178 $var::NMZ{'status'} = $orig_status; # See also change_filenames()
1179 return ($output_dir, @targets);
1180 }
1181
1182 if (!$opt_norc && !(defined $ENV{'MKNMZNORC'})){
1183 load_rcfiles();
1184 }
1185 if ($opt_config) {
1186 if (-f $opt_config) {
1187 util::vprint(_("Reading rcfile: "));
1188 load_rcfile($ConfigFile = $opt_config);
1189 util::vprint(" $opt_config\n");
1190 }
1191 }
1192
1193 if ($index_lang) {
1194 $util::LANG = $index_lang;
1195 util::dprint("Override indexing language: $util::LANG\n");
1196 }
1197
1198 if ($opt_help) {
1199 show_usage();
1200 exit 1;
1201 }
1202
1203 if ($opt_version) {
1204 show_version();
1205 exit 1;
1206 }
1207
1208 load_filtermodules(); # to make effect $opt_config, $index_lang.
1209 postload_modules();
1210
1211 foreach my $key (keys %opt_conf){
1212 if (defined ($opt_conf{$key})) {
1213 ${*{$conf::{$key}}{SCALAR}} = $opt_conf{$key};
1214 }
1215 }
1216
1217 if ($opt_mailnews) {
1218 $MediaType = 'message/rfc822';
1219 }
1220 if ($opt_mhonarc) {
1221 $MediaType = 'text/html; x-type=mhonarc';
1222 }
1223 if ($opt_all) {
1224 $conf::ALLOW_FILE = ".*";
1225 }
1226 if ($opt_chasen) {
1227 $conf::WAKATI = $conf::CHASEN;
1228 $var::Opt{'noun'} = 0;
1229 }
1230 if ($opt_chasen_noun) {
1231 $conf::WAKATI = $conf::CHASEN_NOUN;
1232 $var::Opt{'noun'} = 1;
1233 }
1234 if ($opt_kakasi) {
1235 $conf::WAKATI = $conf::KAKASI;
1236 $var::Opt{'noun'} = 0;
1237 }
1238 if ($opt_mecab) {
1239 $conf::WAKATI = $conf::MECAB;
1240 $var::Opt{'noun'} = 0;
1241 }
1242 if ($include_file) {
1243 do $include_file;
1244 util::dprint("Included: $include_file\n");
1245 }
1246 if ($target_list) {
1247 if ($CheckPoint{'continue'}) {
1248 @targets = ("dummy");
1249 } else {
1250 @targets = load_target_list($target_list);
1251 util::dprint(_("Loaded: ")."$target_list\n");
1252 }
1253 $targets_loaded = 1;
1254 }
1255 if ($opt_checkpoint_sub) {
1256 $CheckPoint{'on'} = 1;
1257 $CheckPoint{'continue'} = 1;
1258 @argv = grep {! /^-S$/} @argv; # remove -S
1259 }
1260
1261 if (defined $ReplaceCode) {
1262 my $orig = "/foo/bar/baz/quux.html";
1263 $_ = $orig;
1264 eval $ReplaceCode;
1265 if ($@) { # eval error
1266 util::cdie(_("Invalid replace: ")."$ReplaceCode\n");
1267 }
1268 util::dprint(_("Replace: ")."$orig -> $_\n");
1269 }
1270
1271 if ($opt_show_config) {
1272 show_config();
1273 exit 1;
1274 }
1275
1276 if (@ARGV == 0 && $targets_loaded == 0) {
1277 show_mini_usage();
1278 exit 1;
1279 }
1280
1281 $output_dir = $cwd unless defined $output_dir;
1282 util::cdie("$output_dir: "._("invalid output directory\n"))
1283 unless (-d $output_dir && -w $output_dir);
1284
1285 if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
1286 util::win32_yen_to_slash(\$output_dir);
1287 }
1288
1289 # take remaining @ARGV as targets
1290 if (@ARGV > 0 && $targets_loaded == 0) {
1291 @targets = @ARGV ;
1292 }
1293
1294 # revert @ARGV
1295 # unshift @ARGV, splice(@argv, 0, @argv - @ARGV);
1296 @ARGV = @argv;
1297
1298 return ($output_dir, @targets);
1299 }
1300
1301 sub show_config () {
1302 print _("Loaded rcfile: ") . "@LoadedRcfiles\n" if @LoadedRcfiles;
1303 print _("System: ") . "$English::OSNAME\n" if $English::OSNAME;
1304 print _("Namazu: ") . "$var::VERSION\n" if $var::VERSION;
1305 print _("Perl: ") . sprintf("%f\n", $English::PERL_VERSION);
1306 print _("File-MMagic: ") . "$File::MMagic::VERSION\n" if $File::MMagic::VERSION;
1307 print _("NKF: ") . "$conf::NKF\n" if $conf::NKF;
1308 print _("KAKASI: ") . "$conf::KAKASI\n" if $conf::KAKASI;
1309 print _("ChaSen: ") . "$conf::CHASEN\n" if $conf::CHASEN;
1310 print _("MeCab: ") . "$conf::MECAB\n" if $conf::MECAB;
1311 print _("Wakati: ") . "$conf::WAKATI\n" if $conf::WAKATI;
1312 print _("Lang_Msg: ") . "$util::LANG_MSG\n";
1313 print _("Lang: ") . "$util::LANG\n";
1314 print _("Coding System: ") . "$CodingSystem\n";
1315 print _("CONFDIR: ") . "$CONFDIR\n";
1316 print _("LIBDIR: ") . "$LIBDIR\n";
1317 print _("FILTERDIR: ") . "$FILTERDIR\n";
1318 print _("TEMPLATEDIR: ") . "$TEMPLATEDIR\n";
1319
1320 my @all_types = keys %var::Supported;
1321 my @supported = sort grep { $var::Supported{$_} eq "yes" } @all_types;
1322
1323 my $num_supported = @supported;
1324 my $num_unsupported = @all_types - @supported;
1325 print _("Supported media types: ") . "($num_supported)\n";
1326 print _("Unsupported media types: ") . "($num_unsupported) " . _("marked with minus (-) probably missing application in your \$path.\n");
1327 for my $mtype (sort keys %var::Supported) {
1328 my $yn = $var::Supported{$mtype};
1329 if ($yn eq 'yes') { $yn = ' ' } else {$yn = '-'};
1330 print "$yn $mtype";
1331 if ($var::REQUIRE_ACTIONS{$mtype}){
1332 print ": $var::REQUIRE_ACTIONS{$mtype}.pl";
1333 }
1334 print "\n";
1335 }
1336 }
1337
1338 sub load_target_list ($) {
1339 my ($file) = @_;
1340 my $fh_targets = util::efopen($file);
1341 my @targets = <$fh_targets>;
1342 util::fclose($fh_targets);
1343 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
1344 foreach my $tmp (@targets){
1345 $tmp =~ s/\r//g;
1346 util::win32_yen_to_slash(\$tmp);
1347 }
1348 }
1349 chomp @targets;
1350 return @targets;
1351 }
1352
1353 # convert a relative path into an absolute path
1354 sub absolute_path($$) {
1355 my ($cwd, $path) = @_;
1356
1357 $path =~ s!^\.$!\./!;
1358 $path =~ s!^\.[/\\]!$cwd/!;
1359 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
1360 util::win32_yen_to_slash(\$path);
1361 if ($path =~ m!^//!) {
1362 } elsif ($path =~ m!^/[^/]!) {
1363 my $driveletter = $cwd;
1364 if ($driveletter =~ m!^([A-Z]:)!i){
1365 $driveletter = $1;
1366 }
1367 $path = "$driveletter$path";
1368 } elsif ($path !~ m!^[A-Z]:/!i) {
1369 $path = "$cwd/$path";
1370 }
1371 } else {
1372 $path =~ s!^([^/])!$cwd/$1!;
1373 }
1374 return $path;
1375 }
1376
1377 sub find_target (@) {
1378 my @targets = @_;
1379
1380 my $cwd = cwd();
1381 @targets = map { absolute_path($cwd, $_) } @targets;
1382
1383 # Convert \ to / with consideration for Shift_JIS encoding.
1384 if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
1385 foreach my $tmp (@targets){
1386 util::win32_yen_to_slash(\$tmp);
1387 }
1388 }
1389
1390 # For reporting effects of --allow, --deny, --exclude, --mtime
1391 # options in --verbose mode.
1392 my %counts = ();
1393 $counts{'possible'} = 0;
1394 $counts{'excluded'} = 0;
1395 $counts{'too_old'} = 0;
1396 $counts{'too_new'} = 0;
1397 $counts{'not_allowed'} = 0;
1398 $counts{'denied'} = 0;
1399
1400 # Traverse directories.
1401 # This routine is not efficent but I prefer reliable logic.
1402 my @flist = ();
1403 my $start = time();
1404 util::vprint(_("find_target starting: "). localtime($start). "\n");
1405 while (@targets) {
1406 my $target = shift @targets;
1407
1408 if ($target eq '') {
1409 print STDERR "Warning: target contains empty line, skip it\n";
1410 next;
1411 }
1412
1413 if (-f $target) { # target is a file.
1414 add_target($target, \@flist, \%counts);
1415 } elsif (-d $target) { # target is a directory.
1416 my @subtargets = ();
1417 # Find subdirectories in target directory
1418 # because File::Find::find() does not follow symlink.
1419 if (-l $target) {
1420 my $dh = new DirHandle($target);
1421 while (defined(my $ent = $dh->read)) {
1422 next if ($ent =~ /^\.{1,2}$/);
1423 if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
1424 next if ($ent =~ m!^($conf::DENY_DDN)$!i);
1425 my $tmp = $ent;
1426 util::win32_yen_to_slash(\$tmp);
1427 next if ($ent ne $tmp);
1428 }
1429 my $fname = "$target/$ent";
1430 next if ($fname eq '.' || $fname eq '..');
1431 if (-d $fname) {
1432 push(@subtargets, $fname);
1433 } else {
1434 add_target($fname, \@flist, \%counts);
1435 }
1436 }
1437 } else {
1438 @subtargets = ($target);
1439 }
1440
1441 #
1442 # Wanted routine for File::Find's find().
1443 #
1444 my $wanted_closure = sub {
1445 my $fname = "$File::Find::dir/$_";
1446 add_target($fname, \@flist, \%counts);
1447 };
1448
1449 find($wanted_closure, @subtargets) if (@subtargets > 0);
1450 } else {
1451 print STDERR _("unsupported target: ") . $target;
1452 }
1453 }
1454
1455 # uniq @flist
1456 my %mark = ();
1457 @flist = grep {$mark{$_}++; $mark{$_} == 1} @flist;
1458
1459 # Sort file names with consideration for numbers.
1460 @flist = map { $_->[0] }
1461 sort { $a->[1] cmp $b->[1] }
1462 map { my $tmp = $_; $tmp =~ s/(\d+)/sprintf("%08d", $1)/ge;
1463 [ $_, $tmp ] } @flist;
1464
1465 my $elapsed = time() - $start ;
1466 $elapsed += 1 ; # to round up and avoid 0
1467
1468 # For --verbose option.
1469 report_find_target($elapsed, $#flist + 1, %counts);
1470
1471 return @flist;
1472 }
1473
1474 sub add_target ($\@\%) {
1475 my ($target, $flists_ref, $counts_ref) = @_;
1476
1477 if ($target =~ /[\n\r\t]/) {
1478 $target =~ s/[\n\r\t]//g;
1479 print STDERR "Warning: $target contains LF/CR/TAB chars, skip it\n";
1480 return; # skip a file name containing LF/CR/TAB chars.
1481 }
1482
1483 return unless -f $target; # Only file is targeted.
1484
1485 $counts_ref->{'possible'}++;
1486
1487 unless (util::canopen($target)) {
1488 util::vprint(sprintf(_("Unreadable: %s"), $target));
1489 $counts_ref->{'excluded'}++;
1490 return;
1491 }
1492
1493
1494 if (defined $conf::EXCLUDE_PATH &&
1495 $target =~ /$conf::EXCLUDE_PATH/ )
1496 {
1497 util::vprint(sprintf(_("Excluded: %s"), $target));
1498 $counts_ref->{'excluded'}++;
1499 return;
1500 }
1501
1502 #
1503 # Do processing just like find's --mtime option.
1504 #
1505 if (defined $var::Opt{'mtime'}) {
1506 my $mtime = -M $_;
1507 if ($var::Opt{'mtime'} < 0) {
1508
1509 # This must be `>=' not `>' for consistency with find(1).
1510 if (int($mtime) >= - $var::Opt{'mtime'}) {
1511 util::vprint(sprintf(_("Too old: %s"), $target));
1512 $counts_ref->{'too_old'}++;
1513 return;
1514 }
1515 } elsif ($var::Opt{'mtime'} > 0) {
1516 if ($var::Opt{'mtime'} =~ /^\+/) {
1517 if ((int($mtime) < $var::Opt{'mtime'})) {
1518 util::vprint(sprintf(_("Too new: %s"), $target));
1519 $counts_ref->{'too_new'}++;
1520 return;
1521 }
1522 } else {
1523 if (int($mtime) != $var::Opt{'mtime'}) {
1524 if (int($mtime) > $var::Opt{'mtime'}) {
1525 util::vprint(sprintf(_("Too old: %s"),$target));
1526 $counts_ref->{'too_old'}++;
1527 } else {
1528 util::vprint(sprintf(_("Too new: %s"),$target));
1529 $counts_ref->{'too_new'}++;
1530 }
1531 return;
1532 }
1533 }
1534 } else {
1535 # $var::Opt{'mtime'} == 0 ;
1536 return;
1537 }
1538 }
1539
1540 # Extract the file name of the target.
1541 $target =~ m!^.*/([^/]+)$!;
1542 my $fname = $1;
1543
1544 if ($fname =~ m!^($conf::DENY_FILE)$!i ) {
1545 util::vprint(sprintf(_("Denied: %s"), $target));
1546 $counts_ref->{'denied'}++;
1547 return;
1548 }
1549 if ($fname !~ m!^($conf::ALLOW_FILE)$!i) {
1550 util::vprint(sprintf(_("Not allowed: %s"), $target));
1551 $counts_ref->{'not_allowed'}++;
1552 return;
1553 } else{
1554 util::vprint(sprintf(_("Targeted: %s"), $target));
1555 push @$flists_ref, $target;
1556 }
1557
1558 }
1559
1560 sub report_find_target ($$%) {
1561 my ($elapsed, $num_targeted, %counts) = @_;
1562
1563 util::vprint(_("find_target finished: ") . localtime(time()). "\n");
1564 util::vprint(sprintf(_("Target Files: %d (Scan Performance: Elapsed Sec.: %d, Files/sec: %.1f)"),
1565 $num_targeted, $elapsed,
1566 $num_targeted /$elapsed));
1567 util::vprint(sprintf(_(" Possible: %d, Not allowed: %d, Denied: %d, Excluded: %d"),
1568 $counts{'possible'},
1569 $counts{'not_allowed'},
1570 $counts{'denied'},
1571 $counts{'excluded'}));
1572 util::vprint(sprintf(_(" MTIME too old: %d, MTIME too new: %d"),
1573 $counts{'too_old'},
1574 $counts{'too_new'}));
1575 }
1576
1577 sub show_usage () {
1578 util::dprint(_("lang_msg: ")."$util::LANG_MSG\n");
1579 util::dprint(_("lang: ")."$util::LANG\n");
1580
1581 my $usage = $usage::USAGE;
1582 $usage = _($usage);
1583 printf "$usage", $var::VERSION, $var::TRAC_URI, $var::MAILING_ADDRESS;
1584 }
1585
1586 sub show_mini_usage () {
1587 print _("Usage: mknmz [options] <target>...\n");
1588 print _("Try `mknmz --help' for more information.\n");
1589 }
1590
1591 sub show_version () {
1592 print $usage::VERSION_INFO;
1593 }
1594
1595 #
1596 # check the file. No $msg is good.
1597 #
1598 sub check_file ($$$$$) {
1599 my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_;
1600
1601 my $msg = undef;
1602 if ($mtype =~ /; x-system=unsupported$/) {
1603 $mtype =~ s/; x-system=unsupported$//;
1604 $msg = _("Unsupported media type ")."($mtype)"._(" skipped.");
1605 } elsif ($mtype =~ /; x-error=file_size_max/) {
1606 $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
1607 } elsif ($mtype =~ /; x-error=.*$/) {
1608 $mtype =~ s/^.*; x-error=(.*)$/$1/;
1609 $msg = $mtype;
1610 } elsif ($mtype =~ /^x-system/) {
1611 $msg = _("system error occurred! ")."($mtype)"._(" skipped.");
1612 } elsif (! -e $cfile) {
1613 $msg = _("does NOT EXIST! skipped.");
1614 } elsif (! util::canopen($cfile)) {
1615 $msg = _("is NOT READABLE! skipped.");
1616 } elsif ($text_size == 0 || $cfile_size == 0) {
1617 $msg = _("is 0 size! skipped.");
1618 } elsif ($mtype =~ /^application\/octet-stream/) {
1619 $msg = _("may be a BINARY file! skipped.");
1620 } elsif ($cfile_size > $conf::FILE_SIZE_MAX) {
1621 $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
1622 } elsif ($text_size > $conf::TEXT_SIZE_MAX) {
1623 $msg = _("is larger than your setup after filtered, skipped: ") . 'conf::TEXT_SIZE_MAX (' . $conf::TEXT_SIZE_MAX . ') < '. $text_size ;
1624 }
1625
1626 return $msg;
1627 }
1628
1629
1630 #
1631 # Write NMZ.version file.
1632 #
1633 sub write_version() {
1634 unless (-f $var::NMZ{'version'}) {
1635 my $fh = util::efopen(">$var::NMZ{'version'}");
1636 print $fh "Namazu-Index-Version: $NAMAZU_INDEX_VERSION\n";
1637 util::fclose($fh);
1638 }
1639 }
1640
1641 #
1642 # rename each temporary file to a real file name.
1643 #
1644 sub write_nmz_files () {
1645 util::Rename($var::NMZ{'_i'}, $var::NMZ{'i'});
1646 util::Rename($var::NMZ{'_ii'}, $var::NMZ{'ii'});
1647 util::Rename($var::NMZ{'_w'}, $var::NMZ{'w'});
1648 util::Rename($var::NMZ{'_wi'}, $var::NMZ{'wi'});
1649 util::Rename($var::NMZ{'_p'}, $var::NMZ{'p'});
1650 util::Rename($var::NMZ{'_pi'}, $var::NMZ{'pi'});
1651 }
1652
1653 # output NMZ.body
1654 sub write_body_msg () {
1655 for my $file (glob "$TEMPLATEDIR/NMZ.body*") {
1656 if ($file =~ m!^.*/NMZ\.body(\.[-\w\.]+)?$!){
1657 my $suffix = $1 ? $1 : '';
1658 write_message("$var::NMZ{'body'}${suffix}");
1659 }
1660 }
1661 }
1662
1663 # output NMZ.tips
1664 sub write_tips_msg () {
1665 for my $file (glob "$TEMPLATEDIR/NMZ.tips*") {
1666 if ($file =~ m!^.*/NMZ\.tips(\.[-\w\.]+)?$!){
1667 my $suffix = $1 ? $1 : '';
1668 write_message("$var::NMZ{'tips'}${suffix}");
1669 }
1670 }
1671 }
1672
1673
1674 # output NMZ.result.*
1675 sub write_result_file () {
1676 my $fname = "NMZ.result.normal";
1677
1678 my @files = glob "$TEMPLATEDIR/NMZ.result.*";
1679
1680 for my $file (@files) {
1681 $file =~ m!(NMZ\.result\.[^/]*)$!;
1682 my $target = "$var::OUTPUT_DIR/$1";
1683 if (-f $target) { # already exist;
1684 next;
1685 } else {
1686 my $buf = util::readfile($file);
1687 my $fh_file = util::efopen(">$target");
1688 print $fh_file $buf;
1689 util::fclose($fh_file);
1690 }
1691 }
1692 }
1693
1694 # write NMZ.body and etc.
1695 sub write_message ($) {
1696 my ($msgfile) = @_;
1697
1698 if (! -f $msgfile) {
1699 my ($template, $fname);
1700
1701 $msgfile =~ m!.*/(.*)$!;
1702 $fname = $1;
1703 $template = "$TEMPLATEDIR/$fname";
1704
1705 if (-f $template) {
1706 my $buf = util::readfile($template);
1707 my $fh_output = util::efopen(">$msgfile");
1708 print $fh_output $buf;
1709 util::fclose($fh_output);
1710 }
1711 }
1712 }
1713
1714
1715 #
1716 # Make the NMZ.slog file for logging.
1717 #
1718 sub make_slog_file () {
1719 if (! -f $var::NMZ{'slog'}) {
1720 my $fh = util::efopen(">$var::NMZ{'slog'}");
1721 util::fclose($fh);
1722 undef $fh;
1723 chmod 0666, $var::NMZ{'slog'};
1724 }
1725 {
1726 my $fh_slogfile = util::efopen(">>$var::NMZ{'slog'}");
1727 util::fclose($fh_slogfile);
1728 }
1729 }
1730
1731
1732 #
1733 # Concatenate $CURRENTDIR to the head of each file.
1734 #
1735 sub change_filenames ($) {
1736 my $dir = $var::OUTPUT_DIR;
1737
1738 for my $key (sort keys %var::NMZ) {
1739 next if $key =~ /^_/; # exclude temporary file
1740 $var::NMZ{$key} = "$dir/$var::NMZ{$key}";
1741 }
1742
1743 # temporary files
1744 for my $key (sort keys %var::NMZ) {
1745 if ($key =~ /^_/) {
1746 $var::NMZ{$key} = util::tmpnam($var::NMZ{$key});
1747 }
1748 }
1749
1750 if ($var::Opt{'debug'}) {
1751 for my $key (sort keys %var::NMZ) {
1752 util::dprint("NMZ: $var::NMZ{$key}\n");
1753 }
1754 }
1755 }
1756
1757
1758 #
1759 # Preparation processing for appending index files.
1760 #
1761 sub append_index (@) {
1762 my @flist = @_;
1763
1764 my $docid_base = 0;
1765 ($docid_base, @flist) = set_target_files(@flist);
1766
1767 unless (@flist) { # if @flist is empty
1768 if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
1769 set_lockfile($var::NMZ{'lock2'});
1770 update_dateindex();
1771 update_registry(0);
1772 make_headfoot_pages(0, get_total_keys());
1773 put_log(0, 0, 0, get_total_keys());
1774 make_headfoot_pages(get_status("files"), get_status("keys"));
1775 util::remove_tmpfiles();
1776 }
1777 print _("No files to index.\n");
1778 exit 0;
1779 }
1780
1781 $APPENDMODE = 1;
1782 # conserve files by copying
1783 copy($var::NMZ{'i'}, $var::NMZ{'_i'});
1784 copy($var::NMZ{'w'}, $var::NMZ{'_w'});
1785 copy($var::NMZ{'t'}, $var::NMZ{'_t'})
1786 unless -f $var::NMZ{'_t'}; # preupdated ?
1787 copy($var::NMZ{'p'}, $var::NMZ{'_p'});
1788 copy($var::NMZ{'pi'}, $var::NMZ{'_pi'});
1789
1790 return ($docid_base, @flist);
1791 }
1792
1793 #
1794 # Set target files to @flist and return with the number of regiested files.
1795 #
1796 sub set_target_files() {
1797 my %rdocs; # 'rdocs' means 'registered documents'
1798 my @found_files = @_;
1799
1800 # Load the list of registered documents
1801 $rdocs{'name'} = load_registry();
1802
1803 # Pick up overlapped documents and do marking
1804 my %mark1;
1805 my @overlapped_files;
1806 grep {$_ !~ /^\# / && $mark1{$_}++ } @{$rdocs{'name'}};
1807 $rdocs{'overlapped'} = {}; # Prepare an anonymous hash.
1808 for my $overlapped (grep { $mark1{$_} } @found_files) {
1809 $rdocs{'overlapped'}{$overlapped} = 1;
1810 push @overlapped_files, $overlapped;
1811 };
1812
1813 # Pick up not overlapped documents which are files to index.
1814 my @flist = grep { ! $mark1{$_} } @found_files;
1815
1816 if ($var::Opt{'noupdate'}) {
1817 return (scalar @{$rdocs{'name'}}, @flist);
1818 };
1819
1820 # Load the date index.
1821 $rdocs{'mtime'} = load_dateindex();
1822
1823 if (@{$rdocs{'mtime'}} == 0) {
1824 return (scalar @{$rdocs{'name'}}, @flist);
1825 };
1826
1827 util::assert(@{$rdocs{'name'}} == @{$rdocs{'mtime'}},
1828 "NMZ.r ($#{$rdocs{'name'}}) and NMZ.t ($#{$rdocs{'mtime'}}) are not consistent!");
1829
1830 # Pick up deleted documents and do marking
1831 # (registered in the NMZ.r but not existent in the filesystem)
1832 my @deleted_documents;
1833 unless ($var::Opt{'nodelete'}) {
1834 my %mark2;
1835 grep { $mark2{$_}++ } @found_files;
1836 for my $deleted (grep { $_ !~ /^\# / && ! $mark2{$_} &&
1837 ! $rdocs{'overlapped'}{$_} }
1838 @{$rdocs{'name'}})
1839 {
1840 $rdocs{'deleted'}{$deleted} = 1;
1841 push @deleted_documents, $deleted;
1842 }
1843 }
1844
1845 # check filesize
1846 if ($var::Opt{'checkfilesize'}) {
1847 $rdocs{'size'} = load_sizefield();
1848 }
1849
1850 # Pick up updated documents and set the missing number for deleted files.
1851 my @updated_documents = pickup_updated_documents(\%rdocs);
1852
1853 # Append updated files to the list of files to index.
1854 if (@updated_documents) {
1855 push @flist, @updated_documents;
1856 }
1857
1858 # Remove duplicates.
1859 my %seen = ();
1860 @flist = grep { ! $seen{$_}++ } @flist;
1861
1862 util::dprint(_("\n\n== found files ==\n"), join("\n", @found_files), "\n");
1863 util::dprint(_("\n\n== registered documents ==\n"), join("\n", @{$rdocs{'name'}}), "\n");
1864 util::dprint(_("\n\n== overlapped documents ==\n"), join("\n", @overlapped_files), "\n");
1865 util::dprint(_("\n\n== deleted documents ==\n"), join("\n", @deleted_documents), "\n");
1866 util::dprint(_("\n\n== updated documents ==\n"), join("\n", @updated_documents), "\n");
1867 util::dprint(_("\n\n== files to index ==\n"), join("\n", @flist), "\n");
1868
1869 # Update NMZ.t with the missing number infomation and
1870 # append updated files and deleted files to NMZ.r with leading '# '
1871 if (@updated_documents || @deleted_documents) {
1872 $DeletedFilesCount = 0;
1873 $UpdatedFilesCount = 0;
1874 $UpdatedFilesCount += @updated_documents;
1875 # $DeletedFilesCount += @updated_documents;
1876 $DeletedFilesCount += @deleted_documents;
1877 preupdate_dateindex(@{$rdocs{'mtime'}});
1878 preupdate_registry(@updated_documents, @deleted_documents);
1879 }
1880
1881 # Return the number of registered documents and list of files to index.
1882 return (scalar @{$rdocs{'name'}}, @flist);
1883 }
1884
1885 sub preupdate_registry(@) {
1886 my (@list) = @_;
1887
1888 my $fh_registry = util::efopen(">$var::NMZ{'_r'}");
1889 @list = grep { s/(.*)/\# $1\n/ } @list;
1890 print $fh_registry @list;
1891 print $fh_registry &_("## deleted: ") . util::rfc822time(time()) . "\n\n";
1892 util::fclose($fh_registry);
1893 }
1894
1895 sub preupdate_dateindex(@) {
1896 my @mtimes = @_;
1897
1898 # Since rewriting the entire file, it is not efficient,
1899 # but simple and reliable. this would be revised in the future.
1900 my $fh_dateindex = util::efopen(">$var::NMZ{'_t'}");
1901 # print "\nupdate_dateindex\n", join("\n", @mtimes), "\n\n";
1902 print $fh_dateindex pack("N*", @mtimes);
1903 util::fclose($fh_dateindex);
1904 }
1905
1906 sub update_registry ($) {
1907 my ($docid_count) = @_;
1908
1909 {
1910 my $fh_registry = util::efopen(">>$var::NMZ{'r'}");
1911 my $fh_registry_ = util::efopen($var::NMZ{'_r'});
1912 while (defined(my $line = <$fh_registry_>)) {
1913 print $fh_registry $line;
1914 }
1915 if ($docid_count > 0) {
1916 print $fh_registry &_("## indexed: ") . util::rfc822time(time()) . "\n\n";
1917 }
1918 util::fclose($fh_registry_) if (defined $fh_registry_);
1919 util::fclose($fh_registry);
1920 }
1921 unlink $var::NMZ{'_r'};
1922 }
1923
1924 sub update_dateindex () {
1925 util::Rename($var::NMZ{'_t'}, $var::NMZ{'t'});
1926 }
1927
1928 sub update_field_index () {
1929 my @list = glob "$var::NMZ{'field'}.*.tmp";
1930 for my $tmp (@list) {
1931 if ($tmp =~ m!((^.*/NMZ\.field\..+?(?:\.i)?)\.tmp$)!) {
1932 my $fname_tmp = $1;
1933 my $fname_out = $2;
1934 {
1935 my $fh_field = util::efopen(">>$fname_out");
1936 my $fh_tmp = util::efopen($fname_tmp);
1937
1938 while (defined(my $line = <$fh_tmp>)) {
1939 print $fh_field $line;
1940 }
1941 util::fclose($fh_tmp) if (defined $fh_tmp);
1942 util::fclose($fh_field);
1943 }
1944 unlink $fname_tmp;
1945 } else {
1946 util::cdie(_("update_field_index: ")."@list");
1947 }
1948 }
1949 }
1950
1951 sub pickup_updated_documents (\%) {
1952 my ($rdocs_ref) = @_;
1953 my @updated_documents = ();
1954
1955 # To avoid duplicated outputs caused by --html-split support.
1956 my %printed = ();
1957 my $i = 0;
1958 for my $cfile (@{$rdocs_ref->{'name'}}) {
1959 if (defined($rdocs_ref->{'deleted'}{$cfile})) {
1960 unless ($printed{$cfile}) {
1961 print "$cfile " . _("was deleted!\n");
1962 $printed{$cfile} = 1;
1963 }
1964 $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
1965 } elsif (defined($rdocs_ref->{'overlapped'}{$cfile})) {
1966 my $cfile_mtime = (stat($cfile))[9];
1967 my $rfile_mtime = $rdocs_ref->{'mtime'}[$i];
1968 my ($cfile_size, $rfile_size);
1969 if ($var::Opt{'checkfilesize'}) {
1970 $cfile_size = (stat($cfile))[7];
1971 $rfile_size = $rdocs_ref->{'size'}[$i];
1972 }
1973
1974 if ($rfile_mtime != $cfile_mtime ||
1975 ($var::Opt{'checkfilesize'} && ($cfile_size != $rfile_size))) {
1976 # The file is updated!
1977 unless ($printed{$cfile}) {
1978 print "$cfile " . _("was updated!\n");
1979 $printed{$cfile} = 1;
1980 }
1981 push(@updated_documents, $cfile);
1982 $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
1983 }
1984 }
1985 $i++;
1986 }
1987
1988 return @updated_documents
1989 }
1990
1991 sub load_dateindex() {
1992 my $fh_dateindex = util::efopen($var::NMZ{'t'});
1993
1994 my $size = -s $var::NMZ{'t'};
1995 my $buf = "";
1996 read($fh_dateindex, $buf, $size);
1997 my @list = unpack("N*", $buf); # load date index
1998 # print "\nload_dateindex\n", join("\n", @list), "\n\n";
1999
2000 util::fclose($fh_dateindex);
2001 return [ @list ];
2002 }
2003
2004 sub load_registry () {
2005 my $fh_registry = util::efopen($var::NMZ{'r'});
2006
2007 my @list = ();
2008 my %deleted = ();
2009 my @registered = ();
2010
2011 while (defined(my $line = <$fh_registry>)) {
2012 chomp($line);
2013 next if $line =~ /^\s*$/; # an empty line
2014 next if $line =~ /^##/; # a comment
2015 if ($line =~ s/^\#\s+//) { # deleted document
2016 $deleted{$line}++;
2017 } else {
2018 # Remove HTML's anchor generated by --html-split option.
2019 $line =~ s/\t.*$//g;
2020 push @registered, $line;
2021 }
2022 }
2023
2024 util::fclose($fh_registry) if (defined $fh_registry);
2025
2026 # Exclude deleted documents.
2027 for my $doc (@registered) {
2028 if ($deleted{$doc}) {
2029 push @list, "# $doc";
2030 $deleted{$doc}--;
2031 } else {
2032 push @list, $doc;
2033 }
2034 }
2035
2036 return [ @list ];
2037 }
2038
2039 # get file size information from NMZ.field.size
2040 sub load_sizefield() {
2041 my $fh_sizefield = util::efopen($var::NMZ{'field'} . '.size');
2042 return [] unless defined $fh_sizefield;
2043 my $line;
2044 my @ret = ();
2045 while (defined($line = <$fh_sizefield>)) {
2046 chomp $line;
2047 push @ret, $line;
2048 }
2049 util::fclose($fh_sizefield) if (defined $fh_sizefield);
2050 return \@ret;
2051 }
2052
2053 sub get_total_keys() {
2054 my $keys = get_status("keys");
2055 $keys =~ s/,//g if (defined $keys);
2056 $keys = 0 unless defined $keys;
2057 return $keys;
2058 }
2059
2060 sub get_total_files() {
2061 my $files = get_status("files");
2062 $files =~ s/,//g if (defined $files);
2063 $files = 0 unless defined $files;
2064 return $files;
2065 }
2066
2067 sub get_status($) {
2068 my ($key) = @_;
2069
2070 my $fh = util::fopen($var::NMZ{'status'});
2071 return undef unless defined $fh;
2072
2073 while (defined(my $line = <$fh>)) {
2074 if ($line =~ /^$key\s+(.*)$/) {
2075 util::dprint("status: $key = $1\n");
2076 $fh->close;
2077 return $1;
2078 }
2079 }
2080 util::fclose($fh) if (defined $fh);
2081 return undef;
2082 }
2083
2084 sub put_total_files($) {
2085 my ($number) = @_;
2086 $number =~ tr/,//d;
2087 put_status("files", $number);
2088 }
2089
2090 sub put_total_keys($) {
2091 my ($number) = @_;
2092 $number =~ tr/,//d;
2093 put_status("keys", $number);
2094 }
2095
2096 sub put_status($$) {
2097 my ($key, $value) = @_;
2098
2099 # remove NMZ.status file if the file has a previous value.
2100 unlink $var::NMZ{'status'} if defined get_status($key);
2101
2102 my $fh = util::efopen(">> $var::NMZ{'status'}");
2103 print $fh "$key $value\n";
2104 util::fclose($fh);
2105 }
2106
2107 # do logging
2108 sub put_log ($$$$) {
2109 my ($total_files_size, $start_time, $docid_count, $total_keys_count) = @_;
2110
2111 my $date = localtime;
2112 my $added_files_count = $docid_count;
2113 my $deleted_documents_count = $DeletedFilesCount;
2114 my $updated_documents_count = $UpdatedFilesCount;
2115 my $total_files_count = get_total_files() + $docid_count
2116 - $DeletedFilesCount - $UpdatedFilesCount;
2117 my $added_keys_count = 0;
2118 $added_keys_count = $total_keys_count - get_total_keys();
2119
2120 my $processtime = time - $start_time;
2121 $processtime = 0 if $start_time == 0;
2122 $total_files_size = $total_files_size;
2123 $total_keys_count = $total_keys_count;
2124
2125 my @logmsgs = ();
2126 if ($APPENDMODE) {
2127 push @logmsgs, N_("[Append]");
2128 } else {
2129 push @logmsgs, N_("[Base]");
2130 }
2131 push @logmsgs, N_("Date:"), "$date" if $date;
2132 push @logmsgs, N_("Added Documents:"), util::commas("$added_files_count")
2133 if $added_files_count;
2134 push @logmsgs, N_("Deleted Documents:"),
2135 util::commas("$deleted_documents_count") if $deleted_documents_count;
2136 push @logmsgs, N_("Updated Documents:"),
2137 util::commas("$updated_documents_count") if $updated_documents_count;
2138 push @logmsgs, N_("Size (bytes):"), util::commas("$total_files_size")
2139 if $total_files_size;
2140 push @logmsgs, N_("Total Documents:"), util::commas("$total_files_count")
2141 if $total_files_count;
2142 push @logmsgs, N_("Added Keywords:"), util::commas("$added_keys_count")
2143 if $added_keys_count;
2144 push @logmsgs, N_("Total Keywords:"), util::commas("$total_keys_count")
2145 if $total_keys_count;
2146 push @logmsgs, N_("Wakati:"), "$conf::WAKATI" if $conf::WAKATI;
2147 push @logmsgs, N_("Time (sec):"), util::commas("$processtime")
2148 if $processtime;
2149 push @logmsgs, N_("File/Sec:"), sprintf "%.2f",
2150 (($added_files_count + $updated_documents_count) / $processtime)
2151 if $processtime;
2152 push @logmsgs, N_("System:"), "$English::OSNAME" if $English::OSNAME;
2153 push @logmsgs, N_("Perl:"), sprintf("%f", $English::PERL_VERSION);
2154 push @logmsgs, N_("Namazu:"), "$var::VERSION" if $var::VERSION;
2155
2156 my $log_for_file = "";
2157
2158 my $msg = shift @logmsgs; # [Base] or [Append]
2159 # To stdout, use gettext.
2160 print _($msg), "\n";
2161 # To log file, do not use gettext.
2162 $log_for_file = $msg . "\n";
2163 while (@logmsgs) {
2164 my $field = shift @logmsgs;
2165 my $value = shift @logmsgs;
2166 printf "%-20s %s\n", _($field), "$value";
2167 $log_for_file .= sprintf "%-20s %s\n", $field, "$value";
2168 }
2169 print "\n";
2170 $log_for_file .= "\n";
2171
2172 put_log_to_logfile($log_for_file);
2173 put_total_files($total_files_count);
2174 put_total_keys($total_keys_count);
2175
2176 my $argv = join "\t", @ARGV;
2177 my $cwd = cwd();
2178 put_status("argv", $argv);
2179 put_status("cwd", $cwd);
2180 }
2181
2182 sub put_log_to_logfile ($) {
2183 my ($logmsg) = @_;
2184 my $fh_logfile = util::efopen(">>$var::NMZ{'log'}");
2185 print $fh_logfile $logmsg;
2186 util::fclose($fh_logfile);
2187 }
2188
2189 sub get_year() {
2190 my $year = (localtime)[5] + 1900;
2191
2192 return $year;
2193 }
2194
2195 # Compose NMZ.head and NMZ.foot. Prepare samples if necessary.
2196 # Insert $docid_count, $key_count, and $month/$day/$year respectively.
2197 sub make_headfoot ($$$) {
2198 my ($file, $docid_count, $key_count) = @_;
2199
2200 my $day = sprintf("%02d", (localtime)[3]);
2201 my $month = sprintf("%02d", (localtime)[4] + 1);
2202 my $year = get_year();
2203 my $buf = "";
2204
2205 if (-f $file) {
2206 $buf = util::readfile($file);
2207 } else {
2208 $file =~ m!.*/(.*)$!;
2209 my $fname = $1;
2210 my $template = "$TEMPLATEDIR/$fname";
2211
2212 if (-f $template) {
2213 $buf = util::readfile($template);
2214 } else {
2215 return;
2216 }
2217 }
2218
2219 my $fh_file = util::efopen(">$file");
2220
2221 if ($buf =~ /(<!-- FILE -->)\s*(.*)\s*(<!-- FILE -->)/) {
2222 my $total_files_count = util::commas(get_total_files() + $docid_count
2223 - $DeletedFilesCount - $UpdatedFilesCount);
2224 $buf =~ s/(<!-- FILE -->)(.*)(<!-- FILE -->)/$1 $total_files_count $3/;
2225
2226 }
2227 if ($buf =~ /(<!-- KEY -->)\s*(.*)\s*(<!-- KEY -->)/) {
2228 my $tmp = $2;
2229 $tmp =~ tr/,//d;
2230 $tmp = $key_count;
2231 $tmp = util::commas($tmp);
2232 $buf =~ s/(<!-- KEY -->)(.*)(<!-- KEY -->)/$1 $tmp $3/;
2233 }
2234 my $index_dir = basename($var::OUTPUT_DIR);
2235 $buf =~ s#<!-- INDEX_DIR -->#$index_dir#gs;
2236 $buf =~ s#(<!-- DATE -->)(.*)(<!-- DATE -->)#$1 $year-$month-$day $3#gs;
2237 $buf =~ s/(<!-- VERSION -->)(.*)(<!-- VERSION -->)/$1 v$var::VERSION $3/gs;
2238 $buf =~ s{(<!-- ADDRESS -->)(.*)(<!-- ADDRESS -->)}
2239 {$1\n<a href="mailto:$conf::ADDRESS">$conf::ADDRESS</a>\n$3}gs;
2240 $buf =~ s{(<!-- LINK-REV-MADE -->)(.*)(<!-- LINK-REV-MADE -->)}
2241 {$1\n<link rev="made" href="mailto:$conf::ADDRESS">\n$3}gs;
2242
2243 print $fh_file $buf;
2244 util::fclose($fh_file);
2245 }
2246
2247 # Make phrase hashes for NMZ.p
2248 # Handle two words each for calculating a hash value ranged 0-65535.
2249 sub make_phrase_hash ($$$) {
2250 my ($docid_count, $docid_base, $contref) = @_;
2251
2252 my %tmp = ();
2253 $$contref =~ s!\x7f */? *\d+ *\x7f!!g; # remove tags of weight
2254 $$contref =~ tr/\xa1-\xfea-z0-9 \n//cd; # remove all symbols
2255 my @words = split(/\s+/, $$contref);
2256 @words = grep {$_ ne ""} @words; # remove empty words
2257 my $word_b = shift @words;
2258 my $docid = $docid_count + $docid_base;
2259 for my $word (@words) {
2260 next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
2261 my $hash = hash($word_b . $word);
2262 unless (defined $tmp{$hash}) {
2263 $tmp{$hash} = 1;
2264 $PhraseHashLast{$hash} = 0 unless defined $PhraseHashLast{$hash};
2265 $PhraseHash{$hash} .= pack("w", $docid - $PhraseHashLast{$hash});
2266 # util::dprint("<$word_b, $word> $hash\n");
2267 $PhraseHashLast{$hash} = $docid;
2268 }
2269 $word_b = $word;
2270 }
2271 }
2272
2273 # Construct NMZ.p and NMZ.pi file. this processing is rather complex.
2274 sub write_phrase_hash () {
2275 write_phrase_hash_sub();
2276 util::Rename($var::NMZ{'__p'}, $var::NMZ{'_p'});
2277 util::Rename($var::NMZ{'__pi'}, $var::NMZ{'_pi'});
2278 }
2279
2280 sub write_phrase_hash_sub () {
2281 my $opened = 0;
2282
2283 return 0 if %PhraseHash eq ''; # namazu-devel-ja #3146
2284 util::dprint(_("doing write_phrase_hash() processing.\n"));
2285
2286 my $fh_tmp_pi = util::efopen(">$var::NMZ{'__pi'}");
2287 my $fh_tmp_p = util::efopen(">$var::NMZ{'__p'}");
2288
2289 my $fh_phrase = util::fopen($var::NMZ{'_p'});
2290 my $fh_phraseindex = undef;
2291 if ($fh_phrase) {
2292 $fh_phraseindex = util::efopen($var::NMZ{'_pi'});
2293 $opened = 1;
2294 }
2295
2296 my $ptr = 0;
2297 for (my $i = 0; $i < 65536; $i++) {
2298
2299 my $baserecord = "";
2300 my $baseleng = 0;
2301
2302 if ($opened) {
2303 my $tmp = 0;
2304 read($fh_phraseindex, $tmp, $var::INTSIZE);
2305 $tmp = unpack("N", $tmp);
2306 if ($tmp != 0xffffffff) { # 0xffffffff
2307 $baseleng = readw($fh_phrase);
2308 read($fh_phrase, $baserecord, $baseleng);
2309 }
2310 }
2311 if (defined($PhraseHash{$i})) {
2312 if ($baserecord eq "") {
2313 print $fh_tmp_pi pack("N", $ptr);
2314 my $record = $PhraseHash{$i};
2315 my $n2 = length($record);
2316 my $data = pack("w", $n2) . $record;
2317 print $fh_tmp_p $data;
2318 $ptr += length($data);
2319 } else {
2320 print $fh_tmp_pi pack("N", $ptr);
2321 my $record = $PhraseHash{$i};
2322 my $last_docid = get_last_docid($baserecord, 1);
2323 my $adjrecord = adjust_first_docid($record, $last_docid);
2324 check_records(\$record, \$baserecord, 1) unless defined $adjrecord; # namazu-bugs-ja#31
2325 $record = $adjrecord;
2326 my $n2 = length($record) + $baseleng;
2327 my $data = pack("w", $n2) . $baserecord . $record;
2328 print $fh_tmp_p $data;
2329 $ptr += length($data);
2330 }
2331 } else {
2332 if ($baserecord eq "") {
2333 # if $baserecord has no data, set to 0xffffffff
2334 print $fh_tmp_pi pack("N", 0xffffffff);
2335 } else {
2336 print $fh_tmp_pi pack("N", $ptr);
2337 my $data = pack("w", $baseleng) . $baserecord;
2338 print $fh_tmp_p $data;
2339 $ptr += length($data);
2340 }
2341 }
2342 }
2343
2344 if ($opened) {
2345 util::fclose($fh_phraseindex);
2346 }
2347 if (defined $fh_phrase) {
2348 util::fclose($fh_phrase);
2349 }
2350 util::fclose($fh_tmp_p);
2351 util::fclose($fh_tmp_pi);
2352
2353 %PhraseHash = ();
2354 %PhraseHashLast = ();
2355 }
2356
2357 # Dr. Knuth's ``hash'' from (UNIX MAGAZINE May 1998)
2358 sub hash ($) {
2359 my ($word) = @_;
2360
2361 my $hash = 0;
2362 for (my $i = 0; $word ne ""; $i++) {
2363 $hash ^= $Seed[$i & 0x03][ord($word)];
2364 $word = substr $word, 1;
2365 # $word =~ s/^.//; is slower
2366 }
2367 return $hash & 65535;
2368 }
2369
2370 # Count frequencies of words.
2371 sub count_words ($$$$) {
2372 my ($docid_count, $docid_base, $contref, $kanji) = @_;
2373 my (@tmp);
2374
2375 # Normalize into small letter.
2376 $$contref =~ tr/A-Z/a-z/;
2377
2378 # Remove control char.
2379 $$contref =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a/ /;
2380
2381 # It corresponds to -j option of ChaSen.
2382 $$contref =~ s/^[ \t\f]+//gm; # except "\r\n"
2383 $$contref =~ s/[ \t\f]+$//gm; # except "\r\n"
2384 $$contref =~ s/([a-z])-\n([a-z])/$1$2/gsi; # for hyphenation
2385 if (util::islang("ja")) {
2386 $$contref =~ s/([\x80-\xff])\n([\x80-\xff])/$1$2/gs;
2387 $$contref =~ s/(¡£|¡¢)/$1\n/gs;
2388 }
2389 $$contref =~ s/\n+/\n/gs;
2390
2391 # Do wakatigaki if necessary.
2392 if (util::islang("ja")) {
2393 wakati::wakatize_japanese($contref) if $kanji;
2394 }
2395
2396 my $part1 = "";
2397 my $part2 = "";
2398 if ($$contref =~ /\x7f/) {
2399 $part1 = substr $$contref, 0, index($$contref, "\x7f");
2400 $part2 = substr $$contref, index($$contref, "\x7f");
2401 # $part1 = $PREMATCH; # $& and friends are not efficient
2402 # $part2 = $MATCH . $POSTMATCH;
2403 } else {
2404 $part1 = $$contref;
2405 $part2 = "";
2406 }
2407
2408 # do scoring
2409 my %word_count = ();
2410 $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
2411 wordcount_sub($2, $1, \%word_count)!ge;
2412 wordcount_sub($part1, 1, \%word_count);
2413
2414 # Add them to whole index
2415 my $docid = $docid_count + $docid_base;
2416 for my $word (keys(%word_count)) {
2417 next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
2418 $KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
2419 $KeyIndex{$word} .= pack("w2",
2420 $docid - $KeyIndexLast{$word},
2421 $word_count{$word});
2422 $KeyIndexLast{$word} = $docid;
2423 }
2424 }
2425
2426 #
2427 # Count words and do score weighting
2428 #
2429 sub wordcount_sub ($$\%) {
2430 my ($text, $weight, $word_count) = @_;
2431
2432 # Remove all symbols when -K option is specified.
2433 $text =~ tr/\xa1-\xfea-z0-9/ /c if $var::Opt{'nosymbol'};
2434
2435 # Count frequencies of words in a current document.
2436 # Handle symbols as follows.
2437 #
2438 # tcp/ip -> tcp/ip, tcp, ip
2439 # (tcp/ip) -> (tcp/ip), tcp/ip, tcp, ip
2440 # ((tcpi/ip)) -> ((tcp/ip)), (tcp/ip), tcp
2441 #
2442 # Don't do processing for nested symbols.
2443 # NOTE: When -K is specified, all symbols are already removed.
2444
2445 my @words = split /\s+/, $text;
2446 for my $word (@words) {
2447 next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
2448 if ($var::Opt{'noedgesymbol'}) {
2449 # remove symbols at both ends
2450 $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
2451 }
2452 $word_count->{$word} = 0 unless defined($word_count->{$word});
2453 $word_count->{$word} += $weight;
2454 unless ($var::Opt{'nosymbol'}) {
2455 if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
2456 $word_count->{$1} = 0 unless defined($word_count->{$1});
2457 $word_count->{$1} += $weight;
2458 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2459 } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
2460 $word_count->{$1} = 0 unless defined($word_count->{$1});
2461 $word_count->{$1} += $weight;
2462 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2463 } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
2464 $word_count->{$1} = 0 unless defined($word_count->{$1});
2465 $word_count->{$1} += $weight;
2466 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2467 }
2468 my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
2469 if $word =~ /[^\xa1-\xfea-z_0-9]/;
2470 for my $tmp (@words_) {
2471 next if $tmp eq "";
2472 $word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
2473 $word_count->{$tmp} += $weight;
2474 }
2475 @words_ = ();
2476 }
2477 }
2478 return "";
2479 }
2480
2481 # Construct NMZ.i and NMZ.ii file. this processing is rather complex.
2482 sub write_index () {
2483 my $key_count = write_index_sub();
2484 util::Rename($var::NMZ{'__i'}, $var::NMZ{'_i'});
2485 util::Rename($var::NMZ{'__w'}, $var::NMZ{'_w'});
2486
2487 return $key_count;
2488 }
2489
2490 # readw: read one pack 'w' word.
2491 # This code was contributed by <furukawa@tcp-ip.or.jp>.
2492 sub readw ($) {
2493 my $fh = shift;
2494 my $ret = '';
2495 my $c;
2496
2497 while (read($fh, $c, 1)){
2498 $ret .= $c;
2499 last unless 0x80 & ord $c;
2500 }
2501 return unpack('w', $ret);
2502 }
2503
2504 sub get_last_docid ($$) {
2505 my ($record, $step) = @_;
2506 my (@data) = unpack 'w*', $record;
2507
2508 my $sum = 0;
2509 for (my $i = 0; $i < @data; $i += $step) {
2510 $sum += $data[$i];
2511 }
2512 my $leng = @data / $step;
2513 return $sum;
2514 }
2515
2516 sub adjust_first_docid ($$) {
2517 my ($record, $last_docid) = @_;
2518 my (@data) = unpack 'w*', $record;
2519
2520 $data[0] = $data[0] - $last_docid;
2521 return undef if ($data[0] < 0); # namazu-bug-ja#31
2522 $record = pack 'w*', @data;
2523 return $record;
2524 }
2525
2526 sub write_index_sub () {
2527 my @words = sort keys(%KeyIndex);
2528 return 0 if $#words == -1;
2529
2530 my $cnt = 0;
2531 my $ptr_i = 0;
2532 my $ptr_w = 0;
2533 my $key_count = 0;
2534 my $baserecord = "";
2535
2536 util::dprint(_("doing write_index() processing.\n"));
2537 my $fh_tmp_i = util::efopen(">$var::NMZ{'__i'}");
2538 my $fh_tmp_w = util::efopen(">$var::NMZ{'__w'}");
2539 my $fh_i = util::fopen($var::NMZ{'_i'});
2540 my $fh_ii = util::efopen(">$var::NMZ{'_ii'}");
2541 my $fh_w = util::fopen($var::NMZ{'_w'});
2542 my $fh_wi = util::efopen(">$var::NMZ{'_wi'}");
2543
2544 if ($fh_w) {
2545 FOO:
2546 while (defined(my $line = <$fh_w>)) {
2547 chop $line;
2548 my $current_word = $line;
2549
2550 my $baseleng = readw($fh_i);
2551 read($fh_i, $baserecord, $baseleng);
2552
2553 for (; $cnt < @words; $cnt++) {
2554 last unless $words[$cnt] le $current_word;
2555 my $record = $KeyIndex{$words[$cnt]};
2556 my $leng = length($record);
2557
2558 if ($current_word eq $words[$cnt]) {
2559 my $last_docid = get_last_docid($baserecord, 2);
2560 my $adjrecord = adjust_first_docid($record, $last_docid);
2561 check_records(\$record, \$baserecord, 2) unless defined $adjrecord; # namazu-bugs-ja#31
2562 $record = $adjrecord;
2563 $leng = length($record); # re-measure
2564 my $tmp = pack("w", $leng + $baseleng);
2565
2566 my $data_i = "$tmp$baserecord$record";
2567 my $data_w = "$current_word\n";
2568 print $fh_tmp_i $data_i;
2569 print $fh_tmp_w $data_w;
2570 print $fh_ii pack("N", $ptr_i);
2571 print $fh_wi pack("N", $ptr_w);
2572 $ptr_i += length($data_i);
2573 $ptr_w += length($data_w);
2574 $key_count++;
2575
2576 $cnt++;
2577 next FOO;
2578 } else {
2579 my $tmp = pack("w", $leng);
2580 my $data_i = "$tmp$record";
2581 my $data_w = "$words[$cnt]\n";
2582 print $fh_tmp_i $data_i;
2583 print $fh_tmp_w $data_w;
2584 print $fh_ii pack("N", $ptr_i);
2585 print $fh_wi pack("N", $ptr_w);
2586 $ptr_i += length($data_i);
2587 $ptr_w += length($data_w);
2588 $key_count++;
2589 }
2590 }
2591 my $tmp = pack("w", $baseleng);
2592 my $data_i = "$tmp$baserecord";
2593 my $data_w = "$current_word\n";
2594 print $fh_tmp_i $data_i;
2595 print $fh_tmp_w $data_w;
2596 print $fh_ii pack("N", $ptr_i);
2597 print $fh_wi pack("N", $ptr_w);
2598 $ptr_i += length($data_i);
2599 $ptr_w += length($data_w);
2600 $key_count++;
2601 }
2602 }
2603 while ($cnt < @words) {
2604 my $leng = length($KeyIndex{$words[$cnt]});
2605 my $tmp = pack("w", $leng);
2606 my $record = $KeyIndex{$words[$cnt]};
2607
2608 my $data_i = "$tmp$record";
2609 my $data_w = "$words[$cnt]\n";
2610 print $fh_tmp_i $data_i;
2611 print $fh_tmp_w $data_w;
2612 print $fh_ii pack("N", $ptr_i);
2613 print $fh_wi pack("N", $ptr_w);
2614 $ptr_i += length($data_i);
2615 $ptr_w += length($data_w);
2616 $key_count++;
2617 $cnt++;
2618 }
2619 %KeyIndex = ();
2620 %KeyIndexLast = ();
2621
2622 util::fclose($fh_wi);
2623 util::fclose($fh_w) if (defined $fh_w);
2624 util::fclose($fh_ii);
2625 util::fclose($fh_i) if (defined $fh_i);
2626 util::fclose($fh_tmp_w);
2627 util::fclose($fh_tmp_i);
2628
2629 return $key_count;
2630 }
2631
2632 #
2633 # Decide the media type.
2634 # FIXME: Very ad hoc. It's just a compromise. -- satoru
2635 #
2636 sub decide_type ($$) {
2637 my ($name, $cont) = @_;
2638 return $name if (!defined $cont || $name eq $cont);
2639
2640 util::dprint("decide_type: name: $name, cont: $cont\n");
2641 if ($cont =~ m!^text/plain! && $name =~ m!^text/plain!) {
2642 return $name;
2643 } elsif ($cont =~ m!^application/octet-stream! &&
2644 $name !~ m!^text/!) {
2645 return $name;
2646 } elsif ($cont =~ m!^application/(excel|powerpoint|msword)! &&
2647 $name !~ m!^application/octet-stream!) {
2648 # FIXME: Currently File::MMagic 1.02's checktype_data()
2649 # is unreliable for them.
2650 return $name;
2651 } elsif ($cont =~ m!^application/x-zip! &&
2652 $name =~ m!^application/!) {
2653 # zip format is used other applications e.g. OpenOffice.
2654 # It is necessary to add to check extention.
2655 return $name;
2656 }
2657
2658 return $cont;
2659 }
2660
2661 #
2662 # Debugging code for the "negative numbers" problem.
2663 #
2664 sub check_records ($$$) {
2665 my ($recref, $baserecref, $step) = @_;
2666 dump_record($baserecref, $step);
2667 dump_record($recref, $step);
2668 print STDERR "The \x22negative number\x22 problem occurred.\n";
2669 exit(1);
2670 }
2671
2672 sub dump_record($$) {
2673 my ($recref, $step) = @_;
2674 my (@data) = unpack 'w*', $$recref;
2675 print STDERR "dump record data to NMZ.bug.info (step: $step)...";
2676 my $fh_info = util::fopen(">> NMZ.bug.info");
2677 print $fh_info "dumped record data (step: $step)...";
2678 foreach (@data) {
2679 print $fh_info sprintf(" %08x", $_);
2680 }
2681 print $fh_info "\n";
2682 util::fclose($fh_info);
2683 return;
2684 }
2685
2686 sub trapintr {
2687 my ($signame) = @_;
2688 print STDERR "Warning: signal $signame occured.\n";
2689 }
2690
2691 #
2692 # For avoiding "used only once: possible typo at ..." warnings.
2693 #
2694 muda($conf::ON_MEMORY_MAX,
2695 $conf::WORD_LENG_MAX, $conf::TEXT_SIZE_MAX,
2696 $conf::DENY_FILE, $var::INTSIZE,
2697 $conf::CHASEN_NOUN, $conf::CHASEN,
2698 $conf::KAKASI, $var::Opt{'okurigana'},
2699 $var::Opt{'hiragana'}, $conf::DIRECTORY_INDEX,
2700 $usage::USAGE, $var::Opt{'noheadabst'}, $usage::VERSION_INFO,
2701 $var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX,
2702 $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE,
2703 $conf::ADDRESS, $var::MAILING_ADDRESS,
2704 $conf::FILE_SIZE_MAX,
2705 $conf::MECAB,
2706 $conf::DENY_DDN,
2707 $var::TRAC_URI,
2708 );
2709
2710 sub muda {}
2711