bin/mknmz

   1 #! /usr/bin/perl -w
   2 # -*- Perl -*-
   3 # mknmz - indexer of Namazu
   4 # $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $
   5 #
   6 # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
   7 # Copyright (C) 2000-2008 Namazu Project All rights reserved.
   8 #     This is free software with ABSOLUTELY NO WARRANTY.
   9 #
  10 #  This program is free software; you can redistribute it and/or modify
  11 #  it under the terms of the GNU General Public License as published by
  12 #  the Free Software Foundation; either versions 2, or (at your option)
  13 #  any later version.
  14 #
  15 #  This program is distributed in the hope that it will be useful
  16 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 #  GNU General Public License for more details.
  19 #
  20 #  You should have received a copy of the GNU General Public License
  21 #  along with this program; if not, write to the Free Software
  22 #  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23 #  02111-1307, USA
  24 #
  25 #  This file must be encoded in EUC-JP encoding
  26 #
  27
  28 package mknmz;
  29 require 5.004;
  30 use English;
  31 use lib ".";
  32 use Cwd;
  33 use IO::File;
  34 use File::Find;
  35 use File::MMagic;
  36 use Time::Local;
  37 use strict;  # be strict since v1.2.0
  38 use Getopt::Long;
  39 use File::Copy;
  40 use DirHandle;
  41 use File::Basename;
  42
  43 use vars qw($SYSTEM);
  44 # It exists only for back compatibility.
  45 $SYSTEM = $English::OSNAME;
  46
  47 my $NAMAZU_INDEX_VERSION = "2.0";
  48
  49 my $CodingSystem = "euc";
  50 my $PKGDATADIR    = $ENV{'pkgdatadir'} || "/usr/share/namazu";
  51 my $CONFDIR       = "/etc/namazu";     # directory where mknmzrc are in.
  52 my $LIBDIR        = $PKGDATADIR . "/pl";      # directory where library etc. are in.
  53 my $FILTERDIR     = $PKGDATADIR . "/filter";   # directory where filters are in.
  54 my $TEMPLATEDIR   = $PKGDATADIR . "/template"; # directory where templates are in.
  55
  56 my $DeletedFilesCount = 0;
  57 my $UpdatedFilesCount = 0;
  58 my $APPENDMODE = 0;
  59 my %PhraseHash = ();
  60 my %PhraseHashLast = ();
  61 my %KeyIndex = ();
  62 my %KeyIndexLast = ();
  63 my %CheckPoint = ("on" => undef, "continue" => undef);
  64 my $ConfigFile = undef;
  65 my $MediaType  = undef;
  66
  67 my $ReplaceCode  = undef;  # perl code for transforming URI
  68 my @Seed = ();
  69 my @LoadedRcfiles = ();
  70 my $Magic = new File::MMagic;
  71
  72 my $ReceiveTERM = 0;
  73
  74 STDOUT->autoflush(1);
  75 STDERR->autoflush(1);
  76 main();
  77 sub main {
  78     my $start_time = time;
  79
  80     if ($English::PERL_VERSION == 5.008001) {
  81         unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) {
  82             print "Run mknmz with the environment variable PERL_HASH_SEED=0\n";
  83             exit 1;
  84         }
  85     }
  86
  87     init();
  88
  89     # At first, loading pl/conf.pl to prevent overriding some variables.
  90     preload_modules();
  91
  92     # set LANG and bind textdomain
  93     util::set_lang();
  94     textdomain('namazu', $util::LANG_MSG);
  95
  96     load_modules();
  97     my ($output_dir, @targets) = parse_options();
  98     my ($docid_base, $total_files_num) = prep($output_dir, @targets);
  99
 100     my $swap = 1;
 101     my $docid_count = 0;
 102     my $file_count = 0;
 103     my $total_files_size = 0;
 104     my $key_count = 0;
 105     my $checkpoint = 0;
 106     my $flist_ptr = 0;
 107     my $processed_files_size = 0;
 108
 109     if ($CheckPoint{'continue'}) {
 110         # Restore variables
 111         eval util::readfile($var::NMZ{'_checkpoint'}) ;
 112     } else {
 113         print $total_files_num . _(" files are found to be indexed.\n");
 114     }
 115
 116     {
 117         my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
 118         my $fh_flist = util::efopen($var::NMZ{'_flist'});
 119         my %field_indices = ();
 120         get_field_index_base(\%field_indices);
 121
 122         if ($CheckPoint{'continue'}) {
 123             seek($fh_flist, $flist_ptr, 0);
 124         }
 125
 126         # Process target files one by one
 127         while (defined(my $line = <$fh_flist>)) {
 128             $flist_ptr += length($line);
 129             my $cfile = $line;
 130             chomp $cfile;
 131             util::dprint(_("target file: ")."$cfile\n");
 132
 133             my ($cfile_size, $num) =
 134                 process_file($cfile, $docid_count, $docid_base,
 135                              $file_count, \%field_indices,
 136                              $fh_errorsfile, $total_files_num);
 137             if ($num == 0) {
 138                 $total_files_num--;
 139                 next;
 140             } else {
 141                 $docid_count += $num;
 142                 $file_count++;
 143             }
 144
 145             $total_files_size     += $cfile_size;
 146             $processed_files_size += $cfile_size;
 147             last if $ReceiveTERM;
 148             if ($processed_files_size > $conf::ON_MEMORY_MAX) {
 149                 if (%KeyIndex) {
 150                     $key_count = write_index();
 151                     print _("Writing index files...");
 152                     write_phrase_hash();
 153                     print "\n";
 154                 }
 155                 $processed_files_size = 0;
 156                 $checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
 157             }
 158         }
 159
 160         util::fclose($fh_flist);
 161         util::fclose($fh_errorsfile);
 162     }
 163     # This should be out of above blocks because of file handler closing.
 164     re_exec($flist_ptr, $docid_count, $docid_base, $start_time,
 165             $total_files_size, $total_files_num,
 166             $file_count, $key_count) if $checkpoint;
 167
 168     if (%KeyIndex) {
 169         $key_count = write_index();
 170         print _("Writing index files...");
 171         write_phrase_hash();
 172         print "\n";
 173     }
 174
 175     $key_count = get_total_keys() unless $key_count;
 176     do_remain_job($total_files_size, $docid_count, $key_count,
 177                    $start_time);
 178     exit 0;
 179 }
 180
 181 #
 182 # FIXME: Very complicated.
 183 #
 184 sub process_file ($$$$\%$$) {
 185     my ($cfile, $docid_count, $docid_base, $file_count,
 186         $field_indices, $fh_errorsfile, $total_files_num) = @_;
 187
 188     my $processed_num = 0;
 189     my $file_size = util::filesize($cfile);
 190
 191     if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
 192         my @parts;
 193         @parts = htmlsplit::split($cfile, "NMZ.partial")
 194             if ($file_size <= $conf::FILE_SIZE_MAX);
 195         if (@parts > 1) {
 196             my $id = 0;
 197             for my $part (@parts) {
 198                 next if (defined $conf::EXCLUDE_PATH &&
 199                          "$cfile#$part" =~ /$conf::EXCLUDE_PATH/);
 200                 my $fname = util::tmpnam("NMZ.partial.$id");
 201                 my $fragment  = defined $part ? $part : undef;
 202                 my $uri   = generate_uri($cfile, $fragment);
 203                 my $result = namazu_core($fname,
 204                                          $docid_count + $processed_num,
 205                                          $docid_base, $file_count,
 206                                          $field_indices, $fh_errorsfile,
 207                                          $total_files_num,
 208                                          $uri, $id, $#parts);
 209                 if ($result > 0) {
 210                     $processed_num++;
 211                     my $rname = defined $part ? "$cfile\t$part" : "$cfile";
 212                     put_registry($rname);
 213                 }
 214                 unlink $fname;
 215                 $id++;
 216             }
 217             return ($file_size, $processed_num);
 218         }
 219     }
 220     my $result = namazu_core($cfile, $docid_count, $docid_base,
 221                              $file_count, $field_indices,
 222                              $fh_errorsfile, $total_files_num,
 223                              undef, undef, undef);
 224     if ($result > 0) {
 225         $processed_num++;
 226         put_registry($cfile);
 227     }
 228
 229     return ($file_size, $processed_num);
 230 }
 231
 232 #
 233 # Load mknmzrcs:
 234 #
 235 #  1. MKNMZRC environment
 236 #
 237 #  2. $(sysconfdir)/$(PACKAGE)/mknmzrc
 238 #
 239 #  3. ~/.mknmzrc
 240 #
 241 #  4. user-specified mknmzrc set by mknmz --config=file option.
 242 #
 243 # If multiple files exists, read all of them.
 244 #
 245 sub load_rcfiles () {
 246     my (@cand) = ();
 247
 248     # To support Windows. Since they have nasty drive letter convention,
 249     # it is necessary to change mknmzrc dynamically with env. variable.
 250     push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'};
 251     push @cand, "$CONFDIR/mknmzrc";
 252     push @cand, "$ENV{'HOME'}/.mknmzrc";
 253
 254     util::vprint(_("Reading rcfile: "));
 255     for my $rcfile (@cand) {
 256         if (-f $rcfile) {
 257                 load_rcfile ($rcfile);
 258             util::vprint(" $rcfile");
 259         }
 260     }
 261     util::vprint("\n");
 262 }
 263
 264 sub load_rcfile ($) {
 265     my ($rcfile) = @_;
 266     if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
 267         util::win32_yen_to_slash(\$rcfile);
 268     }
 269     return if (grep {m/^$rcfile$/} @LoadedRcfiles);
 270     do $rcfile;
 271     if ($@) {
 272         chop $@;
 273         push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'";
 274     }else {
 275         push @LoadedRcfiles, $rcfile;
 276     }
 277
 278     # Dirty workaround.
 279     $LIBDIR = $conf::LIBDIR
 280         if (defined $conf::LIBDIR && -d $conf::LIBDIR);
 281     $FILTERDIR = $conf::FILTERDIR
 282         if (defined $conf::FILTERDIR && -d $conf::FILTERDIR);
 283     $TEMPLATEDIR = $conf::TEMPLATEDIR
 284         if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR);
 285 }
 286
 287 sub re_exec($$$$$$$$) {
 288     my ($flist_ptr, $docid_count, $docid_base, $start_time,
 289         $total_files_size, $total_files_num, $file_count, $key_count) = @_;
 290
 291     # store variables
 292     {
 293         my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}");
 294
 295         print $fh_checkpoint <<EOM;
 296         \$DeletedFilesCount = $DeletedFilesCount;
 297         \$UpdatedFilesCount = $UpdatedFilesCount;
 298         \$APPENDMODE = $APPENDMODE;
 299         \$flist_ptr = $flist_ptr;
 300         \$docid_count = $docid_count;
 301         \$docid_base = $docid_base;
 302         \$start_time = $start_time;
 303         \$total_files_size = $total_files_size;
 304         \$total_files_num = $total_files_num;
 305         \$key_count = $key_count;
 306         \$file_count = $file_count;
 307         \$\$ = $$;
 308 EOM
 309         util::fclose($fh_checkpoint);
 310     }
 311
 312     @ARGV = ("-S", @ARGV) ;
 313     print _("Checkpoint reached: re-exec mknmz...\n");
 314     util::dprint(join ' ', ("::::", @ARGV, "\n"));
 315     exec ($0, @ARGV) ;
 316 }
 317
 318 sub put_registry ($) {
 319     my ($filename) = @_;
 320     my $fh_registry = util::efopen(">>$var::NMZ{'_r'}");
 321     print $fh_registry $filename, "\n";
 322     util::fclose($fh_registry);
 323 }
 324
 325
 326 # Initialization
 327 #   $CodingSystem: Character Coding System 'euc' or 'sjis'
 328 sub init () {
 329     if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
 330         $CodingSystem = "sjis";
 331         if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
 332             $CONFDIR = $1 . $CONFDIR ;
 333         }
 334         if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
 335             $LIBDIR = $1 . $LIBDIR ;
 336         }
 337         if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
 338             $FILTERDIR = $1 . $FILTERDIR ;
 339         }
 340         if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
 341             $TEMPLATEDIR = $1 . $TEMPLATEDIR ;
 342         }
 343     } else {
 344         $CodingSystem = "euc";
 345     }
 346
 347     $SIG{'INT'}  = sub {
 348         util::cdie("SIGINT caught! Aborted.\n");
 349     };
 350
 351     $SIG{'TERM'}  = sub {
 352         print STDERR "SIGTERM caught!\n";
 353         $ReceiveTERM = 1;
 354     };
 355 }
 356
 357 sub preload_modules () {
 358     unshift @INC, $LIBDIR;
 359     # workaround for test suites.
 360     unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'};
 361
 362     require "var.pl" || die "unable to require \"var.pl\"\n";
 363     require "conf.pl" || die "unable to require \"conf.pl\"\n";
 364     require "util.pl" || die "unable to require \"util.pl\"\n";
 365     require "gettext.pl" || die "unable to require \"gettext.pl\"\n";
 366     require "ext.pl" || die "unable to require \"ext.pl\"\n";
 367 }
 368
 369 sub postload_modules () {
 370     require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n";
 371 }
 372
 373 sub load_modules () {
 374     require "usage.pl" || die "unable to require \"usage.pl\"\n";
 375     require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n";
 376     require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
 377     require "seed.pl" || die "unable to require \"seed.pl\"\n";
 378     require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";
 379
 380     @Seed = seed::init();
 381 }
 382
 383 sub load_filtermodules () {
 384     unshift @INC, $FILTERDIR;
 385
 386     #
 387     # Windows modules must be loaded first.
 388     # Because OLE filters have low precedence over normal ones.
 389     #
 390     load_win32modules() if $English::OSNAME eq "MSWin32";
 391
 392     # Check filter modules
 393     my @filters = ();
 394     @filters = glob "$FILTERDIR/*.pl";
 395
 396     load_filters(@filters);
 397 }
 398
 399 sub load_win32modules () {
 400     # Check filter modules
 401     my @filters = ();
 402     if (-f "../filter/win32/olemsword.pl") { # to ease developing
 403         @filters = glob "../filter/win32/*.pl";
 404         unshift @INC, "../filter/win32";
 405     } else {
 406         @filters = glob "$FILTERDIR/win32/*.pl";
 407         unshift @INC, "$FILTERDIR/win32";
 408     }
 409
 410     load_filters(@filters);
 411 }
 412
 413 sub load_filters (@) {
 414     my @filters = @_;
 415
 416     for my $filter (@filters) {
 417         $filter =~ m!([-\w]+)\.pl$!;
 418         my $module = $1;
 419         require "$module.pl" || die "unable to require \"$module.pl\"\n";;
 420         my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv);
 421
 422         eval "\@mtypes =    ${module}::mediatype();";
 423         die $@ if $@;  # eval error
 424         eval "\$status =    ${module}::status();";
 425         die $@ if $@;
 426         eval "\$recursive = ${module}::recursive();";
 427         die $@ if $@;
 428         eval "\$pre_codeconv  = ${module}::pre_codeconv();";
 429         die $@ if $@;
 430         eval "\$post_codeconv  = ${module}::post_codeconv();";
 431         die $@ if $@;
 432         eval "${module}::add_magic(\$Magic);";
 433         die $@ if $@;
 434
 435         for my $mt (@mtypes) {
 436         next if (defined $var::Supported{$mt} &&
 437                  $var::Supported{$mt} eq 'yes' && $status eq 'no');
 438             $var::Supported{$mt} = $status;
 439             $var::REQUIRE_ACTIONS{$mt} = $module;
 440             $var::RECURSIVE_ACTIONS{$mt} = $recursive;
 441             $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv;
 442             $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv;
 443         }
 444     }
 445 }
 446
 447 # Core routine.
 448 #
 449 # FIXME: Too many parameters. They must be cleared.
 450 #
 451 sub namazu_core ($$$$$$$$$$) {
 452     my ($cfile, $docid_count, $docid_base,
 453         $file_count, $field_indices, $fh_errorsfile, $total_files_num,
 454         $uri, $part_id, $part_num) = @_;
 455
 456     my $headings = "";
 457     my $content = "";
 458     my $weighted_str = "";
 459     my %fields;
 460     my $msg_prefix;
 461
 462     if ($part_id) {
 463         $msg_prefix = "    $part_id/$part_num - ";
 464     } else {
 465         $msg_prefix = $file_count + 1 . "/$total_files_num - ";
 466     }
 467
 468     unless ($uri) {
 469         $uri = generate_uri($cfile);  # Make a URI from a file name.
 470     }
 471     my ($cfile_size, $text_size, $kanji, $mtype) =
 472         load_document(\$cfile, \$content, \$weighted_str,
 473                       \$headings, \%fields);
 474
 475     {
 476         $fields{'mtime'} = (stat($cfile))[9];
 477         my $utc = $fields{'mtime'};
 478         $utc = time::rfc822time_to_mtime($fields{'date'})
 479                 if (defined $fields{'date'});
 480         if ($utc == -1) {
 481             my $date = $fields{'date'};
 482             print "$cfile Illegal date format. : $date\n";
 483             print $fh_errorsfile "$cfile Illegal date format. : $date\n";
 484             $utc = $fields{'mtime'};
 485             delete $fields{'date'};
 486         }
 487         $fields{'utc'} = $utc;
 488     }
 489
 490     util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n");
 491
 492     # Check if the file is acceptable.
 493     my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri);
 494     if (defined $err) {
 495         if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
 496             my $uri2 = codeconv::eucjp_to_shiftjis($uri);
 497             print $msg_prefix . "$uri2 $err\n";
 498         } else {
 499             print $msg_prefix . "$uri $err\n";
 500         }
 501         print $fh_errorsfile "$cfile $err\n";
 502         return 0;  # return 0 if error
 503     }
 504
 505     # Print processing file name as URI.
 506     if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
 507         my $uri2 = codeconv::eucjp_to_shiftjis($uri);
 508         print $msg_prefix . "$uri2 [$mtype]\n";
 509     } else {
 510         print $msg_prefix . "$uri [$mtype]\n";
 511     }
 512
 513     # Add filename.
 514     my $filename = defined $cfile ? $cfile : '';
 515     codeconv::toeuc(\$filename);
 516     $filename = basename($filename);
 517     $fields{'filename'} = $filename;
 518
 519     complete_field_info(\%fields, $cfile, $uri,
 520                         \$headings, \$content, \$weighted_str);
 521     put_field_index(\%fields, $field_indices);
 522
 523     put_dateindex($cfile);
 524     $content .= "\n\n$filename\n\n";    # add filename
 525     $content .= $weighted_str;          # add weights
 526     count_words($docid_count, $docid_base, \$content, $kanji);
 527     make_phrase_hash($docid_count, $docid_base, \$content);
 528
 529     # assertion
 530     util::assert($cfile_size != 0,
 531                  "cfile_size == 0 at the end of namazu_core.");
 532
 533     return $cfile_size;
 534 }
 535
 536 #
 537 # Make the URI from the given file name.
 538 #
 539 sub generate_uri (@) {
 540     my ($file, $fragment) = @_;
 541     return "" unless defined $file;
 542
 543     # omit a file name if omittable
 544     $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o;
 545
 546     if (defined $ReplaceCode) {
 547         # transforming URI by evaling
 548         $_ = $file;
 549         eval $ReplaceCode;
 550         $file = $_;
 551     }
 552
 553     if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
 554         $file =~ s#^([A-Z]):#/$1|#i; # converting a drive part like: /C|
 555     }
 556
 557     if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
 558         $file = codeconv::shiftjis_to_eucjp($file);
 559     }
 560     if (defined $fragment) {
 561         codeconv::toeuc(\$fragment);
 562     }
 563
 564     unless ($var::Opt{'noencodeuri'}) {
 565         for my $tmp ($file, $fragment) {
 566             next unless defined $tmp;
 567
 568             # Escape unsafe characters (not strict)
 569             $tmp =~ s/\%/%25/g;  # Convert original '%' into '%25' v1.1.1.2
 570             $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/
 571                 sprintf("%%%02X",ord($1))/ge;
 572         }
 573     }
 574
 575
 576     my $uri = $file;
 577     $uri .= "#" . $fragment if defined $fragment;
 578     if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
 579         # restore '|' for drive letter rule of Win32, OS/2
 580         $uri =~ s!^/([A-Z])%7C!/$1|!i;
 581     }
 582     return $uri;
 583 }
 584
 585
 586 sub get_field_index_base (\%) {
 587     my ($field_indices) = @_;
 588
 589     my @keys = split('\|', $conf::SEARCH_FIELD);
 590     if ($var::Opt{'meta'}) {
 591         push @keys, (split '\|', $conf::META_TAGS);
 592     }
 593     for my $key (@keys) {
 594         $key = lc($key);
 595         my $fname    = "$var::NMZ{'field'}.$key";
 596         my $tmp_fname = util::tmpnam("NMZ.field.$key");
 597         my $size = 0;
 598         $size = -s $fname if -f $fname;
 599         $size += -s $tmp_fname if -f $tmp_fname;
 600         $field_indices->{$key} = $size;
 601     }
 602 }
 603
 604 sub complete_field_info (\%$$\$\$\$) {
 605     my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_;
 606
 607     for my $field (keys %{$fields}) {
 608         if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) {
 609             delete $fields->{$field};
 610         }
 611     }
 612
 613     unless (defined($fields->{'title'})) {
 614         $fields->{'title'} = gfilter::filename_to_title($cfile, $wsref);
 615     }
 616     unless (defined($fields->{'date'})) {
 617         my $mtime = $fields->{'mtime'};
 618         my $date = util::rfc822time($mtime);
 619         $fields->{'date'} = $date;
 620     }
 621     unless (defined($fields->{'uri'})) {
 622         $fields->{'uri'} = $uri;
 623     }
 624     unless (defined($fields->{'size'})) {
 625         $fields->{'size'} = -s $cfile;
 626     }
 627     unless (defined($fields->{'summary'})) {
 628         $fields->{'summary'} = make_summary($contref, $headings, $cfile);
 629     }
 630     unless (defined($fields->{'from'}) || defined($fields->{'author'})) {
 631         $fields->{'from'} = getmsg("unknown");
 632     }
 633 }
 634
 635 #
 636 # Currently, messages for NMZ.* files should be encoded in
 637 # EUC-JP currently. We cannot use gettext.pl for the messsage
 638 # because gettext.pl may use Shift_JIS encoded messages.
 639 # So, we should use the function instead of gettext().
 640 #
 641 # FIXME: Ad hoc impl.  getmsg() is effective only for "unknown".
 642 #
 643 sub getmsg($) {
 644     my ($msg) = @_;
 645
 646     if (util::islang_msg("ja")) {
 647         if ($msg eq "unknown") {
 648             return "ÉÔÌÀ";
 649         }
 650     }
 651     return $msg;
 652 }
 653
 654 sub make_summary ($$$) {
 655     my ($contref, $headings, $cfile) = @_;
 656
 657     # pick up $conf::MAX_FIELD_LENGTH bytes string
 658     my $tmp = "";
 659     if ($$headings ne "") {
 660         $$headings =~ s/^\s+//;
 661         $$headings =~ s/\s+/ /g;
 662         $tmp = $$headings;
 663     } else {
 664         $tmp = "";
 665     }
 666
 667     my $offset = 0;
 668     my $tmplen = 0;
 669     while (($tmplen = $conf::MAX_FIELD_LENGTH + 1 - length($tmp)) > 0
 670            && $offset < length($$contref))
 671     {
 672         $tmp .= substr $$contref, $offset, $tmplen;
 673         $offset += $tmplen;
 674         $tmp =~ s/(([\xa1-\xfe]).)/$2 eq "\xa8" ? '': $1/ge;
 675         $tmp =~ s/([-=*\#])\1{2,}/$1$1/g;
 676     }
 677
 678     # -1 means "LF"
 679     my $summary = substr $tmp, 0, $conf::MAX_FIELD_LENGTH - 1;
 680     # Remove a garbage Kanji 1st char at the end.
 681     $summary = codeconv::chomp_eucjp($summary);
 682
 683     $summary =~ s/^\s+//;
 684     $summary =~ s/\s+/ /g;   # normalize white spaces
 685
 686     return $summary;
 687 }
 688
 689
 690 # output the field infomation into NMZ.fields.* files
 691 sub put_field_index (\%$) {
 692     my ($fields, $field_indices) = @_;
 693
 694     my $aliases_regex =
 695         join('|', sort {length($b) <=> length($a)} keys %conf::FIELD_ALIASES);
 696
 697     for my $field (keys %{$fields}) {
 698         util::dprint("Field: $field: $fields->{$field}\n");
 699         if ($field =~ /^($aliases_regex)$/o) {
 700             unless (defined($fields->{$conf::FIELD_ALIASES{$field}})) {
 701                 $fields->{$conf::FIELD_ALIASES{$field}} = $fields->{$field};
 702             }
 703             undef $fields->{$field};
 704         }
 705     }
 706
 707     my @keys = split '\|', $conf::SEARCH_FIELD;
 708     if ($var::Opt{'meta'}) {
 709         my @meta = split '\|', $conf::META_TAGS;
 710         while (my $meta = shift(@meta)) {
 711             $meta = $conf::FIELD_ALIASES{$meta}
 712                 if (defined $conf::FIELD_ALIASES{$meta});
 713
 714             push @keys, $meta;
 715         }
 716
 717         # uniq @keys
 718         my %mark = ();
 719         @keys = grep {$mark{$_}++; $mark{$_} == 1} @keys;
 720     }
 721     for my $key (@keys) {
 722         my $lkey = lc($key);
 723         my $fname    = util::tmpnam("NMZ.field.$lkey");
 724         my $fh_field = util::efopen(">>$fname");
 725         my $output = "";
 726         if (defined($fields->{$key})) {
 727             if ($key ne 'uri') { # workaround for namazu-bugs-ja#30
 728                 $fields->{$key} =~ s/\s+/ /g;
 729                 $fields->{$key} =~ s/\s+$//;
 730                 $fields->{$key} =~ s/^\s+//;
 731             }
 732             $output = $fields->{$key};
 733
 734             # -1 means "LF"
 735             $output = substr $output, 0, $conf::MAX_FIELD_LENGTH - 1;
 736             # Remove a garbage Kanji 1st char at the end.
 737             $output = codeconv::chomp_eucjp($output);
 738
 739             $output =~ s/\n.*$//s;
 740             $output .= "\n";
 741         } else {
 742             $output = "\n";
 743         }
 744         print $fh_field $output;
 745         util::fclose($fh_field);
 746
 747         # put index of field index
 748         {
 749             my $fname        = util::tmpnam("NMZ.field.$lkey.i");
 750             my $fh_field_idx = util::efopen(">>$fname");
 751             print $fh_field_idx pack("N", $field_indices->{$lkey});
 752             $field_indices->{$lkey} += length $output;
 753             util::fclose($fh_field_idx);
 754         }
 755     }
 756
 757 }
 758
 759 # put the date infomation into NMZ.t file
 760 sub put_dateindex ($) {
 761     my ($cfile) = @_;
 762     my $mtime = (stat($cfile))[9];
 763
 764     my $fh_dataindex = util::efopen(">>$var::NMZ{'_t'}");
 765     print $fh_dataindex pack("N", $mtime);
 766     util::fclose($fh_dataindex);
 767 }
 768
 769
 770 # load a document file
 771 sub load_document ($$$$\%) {
 772     my ($orig_cfile, $contref, $weighted_str, $headings, $fields)
 773       = @_;
 774     my $cfile = $$orig_cfile;
 775
 776     return (0, 0, 0, 0) unless (-f $cfile && util::canopen($cfile));
 777
 778     # for handling a filename which contains Shift_JIS code for Windows.
 779     # for handling a filename which contains including space.
 780     my $shelter_cfile = "";
 781     if (($cfile =~ /\s/) ||
 782         ($English::OSNAME eq "MSWin32"
 783         && $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/) )
 784     {
 785         $shelter_cfile = $cfile;
 786         $cfile = util::tmpnam("NMZ.win32");
 787         unlink $cfile if (-e $cfile);
 788         copy($shelter_cfile, $cfile);
 789     }
 790
 791     my $file_size;
 792     $file_size = util::filesize($cfile); # not only file in feature.
 793     if ($file_size > $conf::FILE_SIZE_MAX) {
 794         return ($file_size, $file_size, 0, 'x-system/x-error; x-error=file_size_max');
 795     }
 796
 797     $$contref = util::readfile($cfile);
 798 #    $file_size = length($$contref);
 799
 800     my ($kanji, $mtype) = apply_filter($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, undef);
 801
 802     if ($English::OSNAME eq "MSWin32" && $shelter_cfile ne "") {
 803         unlink $cfile;
 804         $cfile = $shelter_cfile;
 805     }
 806
 807     # Measure the text size at this time.
 808     my $text_size = length($$contref) + length($$weighted_str);
 809
 810     return ($file_size, $text_size, $kanji, $mtype);
 811 }
 812
 813 sub apply_filter($$$$$$$) {
 814     my ($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, $mmtype)
 815       = @_;
 816     my $cfile = $shelter_cfile ne "" ? $shelter_cfile : $$orig_cfile;
 817
 818     # Filtering process.
 819     my $mtype;
 820     my $called_dt = 0;
 821     while (1) {
 822         if (defined $MediaType) {
 823             $mtype = $MediaType;
 824         } elsif (defined $mmtype) {
 825             $mtype = $mmtype;
 826         } else {
 827             my $mtype_n = $Magic->checktype_byfilename($cfile);
 828             my $mtype_c = $Magic->checktype_data($$contref);
 829             my $mtype_m;
 830             $mtype_m = $Magic->checktype_magic($$contref)
 831               if ((! defined $mtype_c) ||
 832                   $mtype_c =~
 833                   /^(text\/html|text\/plain|application\/octet-stream)$/);
 834             $mtype_c = $mtype_m
 835                 if (defined $mtype_m &&
 836                     $mtype_m !~
 837                     /^(text\/html|text\/plain|application\/octet-stream)$/);
 838             $mtype_c = 'text/plain' unless defined $mtype_c;
 839             if ($called_dt) {
 840                 $mtype = $mtype_c;
 841             } else {
 842                 $mtype = decide_type($mtype_n, $mtype_c);
 843                 $called_dt = 1;
 844             }
 845         }
 846         util::dprint(_("Detected type: ")."$mtype\n");
 847
 848         # Pre code conversion.
 849         if ($var::REQUIRE_PRE_CODECONV{$mtype}) {
 850             util::dprint("pre_codeconv\n");
 851             codeconv_document($contref);
 852         }
 853
 854         if (! $var::Supported{$mtype} ||
 855             $var::Supported{$mtype} ne 'yes')
 856         {
 857             util::vprint(_("Unsupported media type ")."$mtype\n");
 858             return (0, "$mtype; x-system=unsupported");
 859         }
 860
 861         if ($var::REQUIRE_ACTIONS{$mtype}) {
 862             util::vprint(_("Using ")."$var::REQUIRE_ACTIONS{$mtype}.pl\n");
 863             require $var::REQUIRE_ACTIONS{$mtype}.'.pl'
 864                 || die _("unable to require ") .
 865                     "\"$var::REQUIRE_ACTIONS{$mtype}.pl\"\n";
 866             my $err = undef;
 867             {
 868                 local $SIG{'PIPE'} = \&trapintr;
 869                 eval '$err = ' . $var::REQUIRE_ACTIONS{$mtype} .
 870                   '::filter($orig_cfile, $contref, $weighted_str, $headings, $fields);';
 871             }
 872             if ($err) {
 873                 if ($err =~ m/; x-system=unsupported$/) {
 874                     return (0, $err);
 875                 }
 876                 return (0, "$mtype; x-error=$err");
 877             }
 878
 879             if ($@) {
 880                 util::vprint(_("Failed to call ")."$var::REQUIRE_ACTIONS{$mtype}\n$@\n");
 881                 return (0, "$mtype; x-error=$@");
 882             }
 883
 884             # Post code conversion.
 885             if ($var::REQUIRE_POST_CODECONV{$mtype}) {
 886                 util::dprint("post_codeconv\n");
 887                 codeconv_document($contref);
 888             }
 889
 890             next if ($var::RECURSIVE_ACTIONS{$mtype});
 891         }
 892         last;
 893     }
 894
 895     my $kanji = $$contref =~ tr/\xa1-\xfe/\xa1-\xfe/;  # Kanji contained?
 896     $kanji += $$weighted_str =~ tr/\xa1-\xfe/\xa1-\xfe/;
 897
 898     return ($kanji, $mtype);
 899 }
 900
 901 sub codeconv_document ($) {
 902     my ($textref) = @_;
 903     codeconv::toeuc($textref);
 904     $$textref =~ s/\r\n/\n/g;
 905     $$textref =~ s/\r/\n/g;
 906     $$textref =~ tr/\x01-\x08\x0b-\x0c\x0e-\x1f\x7f/ /; # Remove control char.
 907 }
 908
 909 sub prep () {
 910     my $docid_base = 0;
 911     my $output_dir = shift @_ ;
 912     my @targets = @_ ;
 913     my @flist = ();
 914
 915     $var::OUTPUT_DIR = $output_dir;
 916
 917     require_modules();
 918     change_filenames();
 919     check_present_index();
 920
 921     # if Checkpoint mode, return
 922     return (0, 0) if $CheckPoint{'continue'};
 923
 924     check_lockfile($var::NMZ{'lock2'});
 925     print _("Looking for indexing files...\n");
 926     @flist = find_target(@targets);
 927     ($docid_base, @flist) = append_index(@flist)
 928         if -f $var::NMZ{'r'};
 929     unless (@flist) { # if @flist is empty
 930         print _("No files to index.\n");
 931         exit 0;
 932     }
 933     set_lockfile($var::NMZ{'lock2'});
 934     save_flist(@flist);
 935     my $total_files_num = @flist;
 936
 937     return ($docid_base, $total_files_num);
 938 }
 939
 940 sub save_flist(@) {
 941     my @flist = @_;
 942     return if (@flist == 0);
 943
 944     my $fh_flist = util::efopen(">$var::NMZ{'_flist'}");
 945     print $fh_flist join("\n", @flist), "\n";
 946     util::fclose($fh_flist);
 947 }
 948
 949 sub require_modules() {
 950     if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) {
 951         require NKF || die "unable to require \"NKF\"\n";
 952         util::dprint(_("code conversion: using NKF module\n"));
 953         $var::USE_NKF_MODULE = 1;
 954     }
 955     if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) {
 956         require Text::Kakasi || die "unable to require \"Text::Kakasi\"\n";
 957         util::dprint(_("wakati: using Text::Kakasi module\n"));
 958         my $res = Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-oeuc', '-w');
 959     }
 960     if (util::islang("ja") && $conf::WAKATI =~ /^module_chasen/) {
 961         require Text::ChaSen || die "unable to require \"Text::ChaSen\"\n";
 962         util::dprint(_("wakati: using Text::ChaSen module\n"));
 963         my @arg = ('-i', 'e', '-j', '-F', '%m ');
 964         @arg    = ('-i', 'e', '-j', '-F', '%m %H\\n') if $var::Opt{'noun'};
 965         my $res = Text::ChaSen::getopt_argv('chasen-perl', @arg);
 966     }
 967     if (util::islang("ja") && $conf::WAKATI =~ /^module_mecab/) {
 968         require MeCab || die "unable to require \"MeCab\"\n";
 969         util::dprint(_("wakati: using MeCab module\n"));
 970     }
 971 }
 972
 973 sub check_lockfile ($) {
 974     # warn if check file exists in case other process is running or abnormal
 975     # stop execution (later is not the major purpose, though).
 976     # This is mainly for early detection before longish find_target.
 977     my ($file) = @_;
 978
 979     if (-f $file) {
 980         print "$file "._("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n");
 981         exit 1;
 982     }
 983 }
 984
 985 sub set_lockfile ($) {
 986     my ($file) = @_;
 987
 988     # make a lock file
 989     if (-f $file) {
 990         print "$file found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n";
 991         exit 1;
 992     } else {
 993         my $fh_lockfile = util::efopen(">$file");
 994         print $fh_lockfile "$$"; # save pid
 995         util::fclose($fh_lockfile);
 996     }
 997 }
 998
 999 sub remove_lockfile ($) {
1000     my ($file) = @_;
1001
1002     # remove lock file
1003     unlink $file if -f $file;
1004 }
1005
1006 # check present index whether it is old type of not
1007 sub check_present_index () {
1008     if (-f $var::NMZ{'i'} && ! -f "$var::NMZ{'wi'}")
1009     {
1010         util::cdie(_("Present index is old type. it's unsupported.\n"));
1011     }
1012 }
1013
1014 # remain
1015 sub do_remain_job ($$$$) {
1016     my ($total_files_size, $docid_count, $key_count, $start_time) = @_;
1017
1018     if ($docid_count == 0) {
1019         # No files are indexed
1020         if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
1021             update_dateindex();
1022             update_registry($docid_count);
1023         }
1024     } else {
1025         set_lockfile($var::NMZ{'lock'});
1026         write_version();
1027         write_body_msg();
1028         write_tips_msg();
1029         write_result_file();
1030         update_field_index();
1031         update_dateindex();
1032         update_registry($docid_count);
1033         write_nmz_files();
1034         make_slog_file();
1035         remove_lockfile($var::NMZ{'lock'});
1036     }
1037     make_headfoot_pages($docid_count, $key_count);
1038     put_log($total_files_size, $start_time, $docid_count, $key_count);
1039     util::remove_tmpfiles();
1040     unlink $var::NMZ{'_flist'};
1041 }
1042
1043 sub make_headfoot_pages($$) {
1044     my ($docid_count, $key_count) = @_;
1045
1046     for my $file (glob "$TEMPLATEDIR/NMZ.head*") {
1047         if ($file =~ m!^.*/NMZ\.head(\.[-\w\.]+)?$!){
1048             my $suffix = $1 ? $1 : '';
1049             make_headfoot("$var::NMZ{'head'}${suffix}", $docid_count, $key_count);
1050         }
1051     }
1052     for my $file (glob "$TEMPLATEDIR/NMZ.foot*") {
1053         if ($file =~ m!^.*/NMZ\.foot(\.[-\w\.]+)?$!){
1054             my $suffix = $1 ? $1 : '';
1055             make_headfoot("$var::NMZ{'foot'}${suffix}", $docid_count, $key_count);
1056        }
1057     }
1058 }
1059
1060 # Parse command line options.
1061 sub parse_options
1062 {
1063     if (@ARGV == 0) {
1064         show_mini_usage();
1065         exit 1;
1066     }
1067
1068     my @targets = ();
1069     my $targets_loaded = 0;
1070     my @argv = @ARGV;
1071     my $cwd = cwd();
1072
1073     my $opt_dummy = 0;
1074     my $opt_version = 0;
1075     my $opt_help = 0;
1076     my $opt_all = 0;
1077     my $opt_chasen = 0;
1078     my $opt_chasen_noun = 0;
1079     my $opt_kakasi = 0;
1080     my $opt_mecab = 0;
1081     my $opt_checkpoint_sub = 0;
1082     my $opt_show_config = 0;
1083     my $opt_mailnews = 0;
1084     my $opt_mhonarc = 0;
1085     my $opt_norc = 0;
1086
1087     my $opt_quiet = undef;
1088     my $opt_config = undef;
1089     my $output_dir = undef;
1090     my $update_index = undef;
1091     my $include_file = undef;
1092     my $target_list = undef;
1093     my $index_lang = undef;
1094
1095     my %opt_conf;
1096
1097 #    Getopt::Long::Configure('bundling');
1098     Getopt::Long::config('bundling');
1099     GetOptions(
1100                '0|help'              => \$opt_help,
1101                '1|exclude=s'         => \$opt_conf{'EXCLUDE_PATH'},
1102                '2|deny=s'            => \$opt_conf{'DENY_FILE'},
1103                '3|allow=s'           => \$opt_conf{'ALLOW_FILE'},
1104                '4|update=s'          => \$update_index,
1105                '5|mhonarc'           => \$opt_mhonarc,
1106                '6|mtime=s'           => \$var::Opt{'mtime'},
1107                '7|html-split'        => \$var::Opt{'htmlsplit'},
1108                'C|show-config'       => \$opt_show_config,
1109                'E|no-edge-symbol'    => \$var::Opt{'noedgesymbol'},
1110                'F|target-list=s'     => \$target_list,
1111                'G|no-okurigana'      => \$var::Opt{'okurigana'},
1112                'H|no-hiragana'       => \$var::Opt{'hiragana'},
1113                'I|include=s'         => \$include_file,
1114                'K|no-symbol'         => \$var::Opt{'nosymbol'},
1115                'L|indexing-lang=s'     => \$index_lang,
1116                'M|meta'              => \$var::Opt{'meta'},
1117                'O|output-dir=s'      => \$output_dir,
1118                'S|checkpoint-sub'    => \$opt_checkpoint_sub,
1119                'T|template-dir=s'    => \$TEMPLATEDIR,
1120                'U|no-encode-uri'     => \$var::Opt{'noencodeuri'} ,
1121                'V|verbose'           => \$var::Opt{'verbose'},
1122                'Y|no-delete'         => \$var::Opt{'nodelete'},
1123                'Z|no-update'         => \$var::Opt{'noupdate'},
1124                'a|all'               => \$opt_all,
1125                'b|use-mecab'         => \$opt_mecab,
1126                'c|use-chasen'        => \$opt_chasen,
1127                'd|debug'             => \$var::Opt{'debug'},
1128                'e|robots'            => \$var::Opt{'robotexclude'},
1129                'f|config=s'          => \$opt_config,
1130                'h|mailnews'          => \$opt_mailnews,
1131                'k|use-kakasi'        => \$opt_kakasi,
1132                'm|use-chasen-noun'   => \$opt_chasen_noun,
1133                'q|quiet'             => \$opt_quiet,
1134                'r|replace=s'         => \$ReplaceCode,
1135                's|checkpoint'        => \$CheckPoint{'on'},
1136                't|media-type=s'      => \$MediaType,
1137                'u|uuencode'          => \$opt_dummy, # for backward compat.
1138                'v|version'           => \$opt_version,
1139                'x|no-heading-summary'=> \$var::Opt{'noheadabst'},
1140                'z|check-filesize'    => \$var::Opt{'checkfilesize'},
1141                'decode-base64'       => \$var::Opt{'decodebase64'},
1142                'norc'                => \$opt_norc,
1143                );
1144
1145     if ($opt_quiet) {
1146         # Make STDOUT quiet by redirecting STDOUT to null device.
1147         my $devnull = util::devnull();
1148         open(STDOUT, ">$devnull") || die "$devnull: $!";
1149     }
1150
1151     if (defined $update_index) {
1152         unless (-d $update_index) {
1153             print _("No such index: "), "$update_index\n";
1154             exit 1;
1155         }
1156
1157         my $orig_status = $var::NMZ{'status'};
1158         $var::NMZ{'status'} = "$update_index/$var::NMZ{'status'}";
1159
1160         my $argv = get_status("argv");
1161         if (!defined $argv) {
1162             print _("No such index: "), "$update_index\n";
1163             exit 1;
1164         }
1165         @ARGV = split /\t/, $argv;
1166         util::dprint(_("Inherited argv: ")."@ARGV\n");
1167
1168         my $cwd  = get_status("cwd");
1169         if (!defined $cwd) {
1170             print _("No such index: "), "$update_index\n";
1171             exit 1;
1172         }
1173         chdir $cwd;
1174         util::dprint(_("Inherited cwd: ")."$cwd\n");
1175
1176         ($output_dir, @targets) = parse_options();
1177         $output_dir = $update_index;
1178         $var::NMZ{'status'} = $orig_status;  # See also change_filenames()
1179         return ($output_dir, @targets);
1180     }
1181
1182     if (!$opt_norc && !(defined $ENV{'MKNMZNORC'})){
1183         load_rcfiles();
1184     }
1185     if ($opt_config) {
1186         if (-f $opt_config) {
1187             util::vprint(_("Reading rcfile: "));
1188             load_rcfile($ConfigFile = $opt_config);
1189             util::vprint(" $opt_config\n");
1190         }
1191     }
1192
1193     if ($index_lang) {
1194         $util::LANG = $index_lang;
1195       util::dprint("Override indexing language: $util::LANG\n");
1196     }
1197
1198     if ($opt_help) {
1199         show_usage();
1200         exit 1;
1201     }
1202
1203     if ($opt_version) {
1204         show_version();
1205         exit 1;
1206     }
1207
1208     load_filtermodules(); # to make effect $opt_config, $index_lang.
1209     postload_modules();
1210
1211     foreach my $key (keys %opt_conf){
1212         if (defined ($opt_conf{$key})) {
1213              ${*{$conf::{$key}}{SCALAR}} = $opt_conf{$key};
1214         }
1215     }
1216
1217     if ($opt_mailnews) {
1218         $MediaType = 'message/rfc822';
1219     }
1220     if ($opt_mhonarc) {
1221         $MediaType = 'text/html; x-type=mhonarc';
1222     }
1223     if ($opt_all) {
1224         $conf::ALLOW_FILE = ".*";
1225     }
1226     if ($opt_chasen) {
1227         $conf::WAKATI = $conf::CHASEN;
1228         $var::Opt{'noun'} = 0;
1229     }
1230     if ($opt_chasen_noun) {
1231         $conf::WAKATI = $conf::CHASEN_NOUN;
1232         $var::Opt{'noun'} = 1;
1233     }
1234     if ($opt_kakasi) {
1235         $conf::WAKATI = $conf::KAKASI;
1236         $var::Opt{'noun'} = 0;
1237     }
1238     if ($opt_mecab) {
1239         $conf::WAKATI = $conf::MECAB;
1240         $var::Opt{'noun'} = 0;
1241     }
1242     if ($include_file) {
1243         do $include_file;
1244         util::dprint("Included: $include_file\n");
1245     }
1246     if ($target_list) {
1247         if ($CheckPoint{'continue'}) {
1248             @targets = ("dummy");
1249         } else {
1250             @targets = load_target_list($target_list);
1251             util::dprint(_("Loaded: ")."$target_list\n");
1252         }
1253         $targets_loaded = 1;
1254     }
1255     if ($opt_checkpoint_sub) {
1256         $CheckPoint{'on'}           = 1;
1257         $CheckPoint{'continue'}     = 1;
1258         @argv = grep {! /^-S$/} @argv;  # remove -S
1259     }
1260
1261     if (defined $ReplaceCode) {
1262         my $orig = "/foo/bar/baz/quux.html";
1263         $_ = $orig;
1264         eval $ReplaceCode;
1265         if ($@) {  # eval error
1266             util::cdie(_("Invalid replace: ")."$ReplaceCode\n");
1267         }
1268         util::dprint(_("Replace: ")."$orig -> $_\n");
1269     }
1270
1271     if ($opt_show_config) {
1272         show_config();
1273         exit 1;
1274     }
1275
1276     if (@ARGV == 0 && $targets_loaded == 0) {
1277         show_mini_usage();
1278         exit 1;
1279     }
1280
1281     $output_dir = $cwd unless defined $output_dir;
1282     util::cdie("$output_dir: "._("invalid output directory\n"))
1283         unless (-d $output_dir && -w $output_dir);
1284
1285     if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
1286         util::win32_yen_to_slash(\$output_dir);
1287     }
1288
1289     # take remaining @ARGV as targets
1290     if (@ARGV > 0 && $targets_loaded == 0) {
1291         @targets = @ARGV ;
1292     }
1293
1294     # revert @ARGV
1295     # unshift @ARGV, splice(@argv, 0, @argv - @ARGV);
1296     @ARGV = @argv;
1297
1298     return ($output_dir, @targets);
1299 }
1300
1301 sub show_config () {
1302     print _("Loaded rcfile: ") . "@LoadedRcfiles\n" if @LoadedRcfiles;
1303     print _("System: ") . "$English::OSNAME\n" if $English::OSNAME;
1304     print _("Namazu: ") . "$var::VERSION\n" if $var::VERSION;
1305     print _("Perl: ") . sprintf("%f\n", $English::PERL_VERSION);
1306     print _("File-MMagic: ") . "$File::MMagic::VERSION\n" if $File::MMagic::VERSION;
1307     print _("NKF: ") . "$conf::NKF\n" if $conf::NKF;
1308     print _("KAKASI: ") . "$conf::KAKASI\n" if $conf::KAKASI;
1309     print _("ChaSen: ") . "$conf::CHASEN\n" if $conf::CHASEN;
1310     print _("MeCab: ") . "$conf::MECAB\n" if $conf::MECAB;
1311     print _("Wakati: ") . "$conf::WAKATI\n" if $conf::WAKATI;
1312     print _("Lang_Msg: ") . "$util::LANG_MSG\n";
1313     print _("Lang: ") . "$util::LANG\n";
1314     print _("Coding System: ") . "$CodingSystem\n";
1315     print _("CONFDIR: ") . "$CONFDIR\n";
1316     print _("LIBDIR: ") . "$LIBDIR\n";
1317     print _("FILTERDIR: ") . "$FILTERDIR\n";
1318     print _("TEMPLATEDIR: ") . "$TEMPLATEDIR\n";
1319
1320     my @all_types =     keys %var::Supported;
1321     my @supported = sort grep { $var::Supported{$_} eq "yes" } @all_types;
1322
1323     my $num_supported = @supported;
1324     my $num_unsupported = @all_types - @supported;
1325     print _("Supported media types:   ") . "($num_supported)\n";
1326     print _("Unsupported media types: ") . "($num_unsupported) " . _("marked with minus (-) probably missing application in your \$path.\n");
1327     for my $mtype (sort keys %var::Supported) {
1328         my $yn = $var::Supported{$mtype};
1329         if ($yn eq 'yes') { $yn = ' ' } else {$yn = '-'};
1330         print "$yn $mtype";
1331         if ($var::REQUIRE_ACTIONS{$mtype}){
1332             print ": $var::REQUIRE_ACTIONS{$mtype}.pl";
1333         }
1334         print "\n";
1335     }
1336 }
1337
1338 sub load_target_list ($) {
1339     my ($file) = @_;
1340     my $fh_targets = util::efopen($file);
1341     my @targets = <$fh_targets>;
1342     util::fclose($fh_targets);
1343     if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
1344         foreach my $tmp (@targets){
1345             $tmp =~ s/\r//g;
1346             util::win32_yen_to_slash(\$tmp);
1347         }
1348     }
1349     chomp @targets;
1350     return @targets;
1351 }
1352
1353 # convert a relative path into an absolute path
1354 sub absolute_path($$) {
1355     my ($cwd, $path) = @_;
1356
1357     $path =~ s!^\.$!\./!;
1358     $path =~ s!^\.[/\\]!$cwd/!;
1359     if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
1360         util::win32_yen_to_slash(\$path);
1361         if ($path =~ m!^//!) {
1362         } elsif ($path =~ m!^/[^/]!) {
1363             my $driveletter = $cwd;
1364             if ($driveletter =~ m!^([A-Z]:)!i){
1365                 $driveletter = $1;
1366             }
1367             $path = "$driveletter$path";
1368         } elsif ($path !~ m!^[A-Z]:/!i) {
1369             $path = "$cwd/$path";
1370         }
1371     } else {
1372         $path =~ s!^([^/])!$cwd/$1!;
1373     }
1374     return $path;
1375 }
1376
1377 sub find_target (@) {
1378     my @targets = @_;
1379
1380     my $cwd = cwd();
1381     @targets = map { absolute_path($cwd, $_) } @targets;
1382
1383     # Convert \ to / with consideration for Shift_JIS encoding.
1384     if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
1385         foreach my $tmp (@targets){
1386             util::win32_yen_to_slash(\$tmp);
1387         }
1388     }
1389
1390     # For reporting effects of --allow, --deny, --exclude, --mtime
1391     # options in --verbose mode.
1392     my %counts = ();
1393     $counts{'possible'} = 0;
1394     $counts{'excluded'} = 0;
1395     $counts{'too_old'} = 0;
1396     $counts{'too_new'} = 0;
1397     $counts{'not_allowed'} = 0;
1398     $counts{'denied'} = 0;
1399
1400     # Traverse directories.
1401     # This routine is not efficent but I prefer reliable logic.
1402     my @flist = ();
1403     my $start = time();
1404     util::vprint(_("find_target starting: "). localtime($start). "\n");
1405     while (@targets) {
1406         my $target = shift @targets;
1407
1408         if ($target eq '') {
1409             print STDERR "Warning: target contains empty line, skip it\n";
1410             next;
1411         }
1412
1413         if (-f $target) { # target is a file.
1414             add_target($target, \@flist, \%counts);
1415         } elsif (-d $target) { # target is a directory.
1416             my @subtargets = ();
1417             # Find subdirectories in target directory
1418             # because File::Find::find() does not follow symlink.
1419             if (-l $target) {
1420                 my $dh = new DirHandle($target);
1421                 while (defined(my $ent = $dh->read)) {
1422                     next if ($ent =~ /^\.{1,2}$/);
1423                     if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
1424                         next if ($ent =~ m!^($conf::DENY_DDN)$!i);
1425                         my $tmp = $ent;
1426                         util::win32_yen_to_slash(\$tmp);
1427                         next if ($ent ne $tmp);
1428                     }
1429                     my $fname = "$target/$ent";
1430                     next if ($fname eq '.' || $fname eq '..');
1431                     if (-d $fname) {
1432                         push(@subtargets, $fname);
1433                     } else {
1434                         add_target($fname, \@flist, \%counts);
1435                     }
1436                 }
1437             } else {
1438                 @subtargets = ($target);
1439             }
1440
1441             #
1442             # Wanted routine for File::Find's find().
1443             #
1444             my $wanted_closure = sub {
1445                 my $fname = "$File::Find::dir/$_";
1446                 add_target($fname, \@flist, \%counts);
1447             };
1448
1449             find($wanted_closure, @subtargets) if (@subtargets > 0);
1450         } else {
1451             print STDERR _("unsupported target: ") . $target;
1452         }
1453     }
1454
1455     # uniq @flist
1456     my %mark = ();
1457     @flist = grep {$mark{$_}++; $mark{$_} == 1} @flist;
1458
1459     # Sort file names with consideration for numbers.
1460     @flist = map  { $_->[0] }
1461              sort { $a->[1] cmp $b->[1] }
1462              map  { my $tmp = $_; $tmp =~ s/(\d+)/sprintf("%08d", $1)/ge;
1463                     [ $_, $tmp ] } @flist;
1464
1465     my $elapsed = time() - $start ;
1466     $elapsed += 1 ;   # to round up and avoid 0
1467
1468     # For --verbose option.
1469     report_find_target($elapsed, $#flist + 1, %counts);
1470
1471     return @flist;
1472 }
1473
1474 sub add_target ($\@\%) {
1475     my ($target, $flists_ref, $counts_ref) = @_;
1476
1477     if ($target =~ /[\n\r\t]/) {
1478         $target =~ s/[\n\r\t]//g;
1479         print STDERR "Warning: $target contains LF/CR/TAB chars, skip it\n";
1480         return;   # skip a file name containing LF/CR/TAB chars.
1481     }
1482
1483     return unless -f $target;  # Only file is targeted.
1484
1485     $counts_ref->{'possible'}++;
1486
1487     unless (util::canopen($target)) {
1488         util::vprint(sprintf(_("Unreadable:     %s"), $target));
1489         $counts_ref->{'excluded'}++;
1490         return;
1491     }
1492
1493
1494     if (defined $conf::EXCLUDE_PATH &&
1495         $target =~ /$conf::EXCLUDE_PATH/ )
1496     {
1497         util::vprint(sprintf(_("Excluded:       %s"), $target));
1498         $counts_ref->{'excluded'}++;
1499         return;
1500     }
1501
1502     #
1503     # Do processing just like find's  --mtime option.
1504     #
1505     if (defined $var::Opt{'mtime'}) {
1506         my $mtime = -M $_;
1507         if ($var::Opt{'mtime'} < 0) {
1508
1509             # This must be `>=' not `>' for consistency with find(1).
1510             if (int($mtime) >= - $var::Opt{'mtime'}) {
1511                 util::vprint(sprintf(_("Too old:        %s"), $target));
1512                 $counts_ref->{'too_old'}++;
1513                 return;
1514             }
1515         } elsif ($var::Opt{'mtime'} > 0) {
1516             if ($var::Opt{'mtime'} =~ /^\+/) {
1517                 if ((int($mtime) < $var::Opt{'mtime'})) {
1518                     util::vprint(sprintf(_("Too new:    %s"), $target));
1519                     $counts_ref->{'too_new'}++;
1520                     return;
1521                 }
1522             } else {
1523                 if (int($mtime) != $var::Opt{'mtime'}) {
1524                     if (int($mtime) > $var::Opt{'mtime'}) {
1525                         util::vprint(sprintf(_("Too old:        %s"),$target));
1526                         $counts_ref->{'too_old'}++;
1527                     } else {
1528                         util::vprint(sprintf(_("Too new:        %s"),$target));
1529                         $counts_ref->{'too_new'}++;
1530                     }
1531                     return;
1532                 }
1533             }
1534         } else {
1535             # $var::Opt{'mtime'} == 0 ;
1536             return;
1537         }
1538     }
1539
1540     # Extract the file name of the target.
1541     $target =~ m!^.*/([^/]+)$!;
1542     my $fname = $1;
1543
1544     if ($fname =~ m!^($conf::DENY_FILE)$!i ) {
1545         util::vprint(sprintf(_("Denied: %s"), $target));
1546         $counts_ref->{'denied'}++;
1547         return;
1548     }
1549     if ($fname !~ m!^($conf::ALLOW_FILE)$!i) {
1550         util::vprint(sprintf(_("Not allowed:    %s"), $target));
1551         $counts_ref->{'not_allowed'}++;
1552         return;
1553     } else{
1554         util::vprint(sprintf(_("Targeted:       %s"), $target));
1555         push @$flists_ref, $target;
1556     }
1557
1558 }
1559
1560 sub report_find_target ($$%) {
1561     my ($elapsed, $num_targeted, %counts) = @_;
1562
1563     util::vprint(_("find_target finished: ") . localtime(time()). "\n");
1564     util::vprint(sprintf(_("Target Files: %d (Scan Performance: Elapsed Sec.: %d, Files/sec: %.1f)"),
1565                          $num_targeted, $elapsed,
1566                          $num_targeted /$elapsed));
1567     util::vprint(sprintf(_("  Possible: %d, Not allowed: %d, Denied: %d, Excluded: %d"),
1568                          $counts{'possible'},
1569                          $counts{'not_allowed'},
1570                          $counts{'denied'},
1571                          $counts{'excluded'}));
1572     util::vprint(sprintf(_("  MTIME too old: %d, MTIME too new: %d"),
1573                          $counts{'too_old'},
1574                          $counts{'too_new'}));
1575 }
1576
1577 sub show_usage () {
1578     util::dprint(_("lang_msg: ")."$util::LANG_MSG\n");
1579     util::dprint(_("lang: ")."$util::LANG\n");
1580
1581     my $usage = $usage::USAGE;
1582     $usage = _($usage);
1583     printf "$usage", $var::VERSION, $var::TRAC_URI, $var::MAILING_ADDRESS;
1584 }
1585
1586 sub show_mini_usage () {
1587     print _("Usage: mknmz [options] <target>...\n");
1588     print _("Try `mknmz --help' for more information.\n");
1589 }
1590
1591 sub show_version () {
1592     print $usage::VERSION_INFO;
1593 }
1594
1595 #
1596 # check the file. No $msg is good.
1597 #
1598 sub check_file ($$$$$) {
1599     my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_;
1600
1601     my $msg = undef;
1602     if ($mtype =~ /; x-system=unsupported$/) {
1603         $mtype =~ s/; x-system=unsupported$//;
1604         $msg = _("Unsupported media type ")."($mtype)"._(" skipped.");
1605     } elsif ($mtype =~ /; x-error=file_size_max/) {
1606         $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
1607     } elsif ($mtype =~ /; x-error=.*$/) {
1608         $mtype =~ s/^.*; x-error=(.*)$/$1/;
1609         $msg = $mtype;
1610     } elsif ($mtype =~ /^x-system/) {
1611         $msg = _("system error occurred! ")."($mtype)"._(" skipped.");
1612     } elsif (! -e $cfile) {
1613         $msg = _("does NOT EXIST! skipped.");
1614     } elsif (! util::canopen($cfile)) {
1615         $msg = _("is NOT READABLE! skipped.");
1616     } elsif ($text_size == 0 || $cfile_size == 0) {
1617         $msg = _("is 0 size! skipped.");
1618     } elsif ($mtype =~ /^application\/octet-stream/) {
1619         $msg = _("may be a BINARY file! skipped.");
1620     } elsif ($cfile_size > $conf::FILE_SIZE_MAX) {
1621         $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
1622     } elsif ($text_size > $conf::TEXT_SIZE_MAX) {
1623         $msg = _("is larger than your setup after filtered, skipped: ") . 'conf::TEXT_SIZE_MAX (' . $conf::TEXT_SIZE_MAX . ') < '. $text_size ;
1624     }
1625
1626     return $msg;
1627 }
1628
1629
1630 #
1631 # Write NMZ.version file.
1632 #
1633 sub write_version() {
1634     unless (-f $var::NMZ{'version'}) {
1635         my $fh = util::efopen(">$var::NMZ{'version'}");
1636         print $fh "Namazu-Index-Version: $NAMAZU_INDEX_VERSION\n";
1637         util::fclose($fh);
1638     }
1639 }
1640
1641 #
1642 # rename each temporary file to a real file name.
1643 #
1644 sub write_nmz_files () {
1645     util::Rename($var::NMZ{'_i'},   $var::NMZ{'i'});
1646     util::Rename($var::NMZ{'_ii'}, $var::NMZ{'ii'});
1647     util::Rename($var::NMZ{'_w'},  $var::NMZ{'w'});
1648     util::Rename($var::NMZ{'_wi'}, $var::NMZ{'wi'});
1649     util::Rename($var::NMZ{'_p'},  $var::NMZ{'p'});
1650     util::Rename($var::NMZ{'_pi'}, $var::NMZ{'pi'});
1651 }
1652
1653 # output NMZ.body
1654 sub write_body_msg () {
1655     for my $file (glob "$TEMPLATEDIR/NMZ.body*") {
1656         if ($file =~ m!^.*/NMZ\.body(\.[-\w\.]+)?$!){
1657             my $suffix = $1 ? $1 : '';
1658             write_message("$var::NMZ{'body'}${suffix}");
1659         }
1660     }
1661 }
1662
1663 # output NMZ.tips
1664 sub write_tips_msg () {
1665     for my $file (glob "$TEMPLATEDIR/NMZ.tips*") {
1666         if ($file =~ m!^.*/NMZ\.tips(\.[-\w\.]+)?$!){
1667             my $suffix = $1 ? $1 : '';
1668             write_message("$var::NMZ{'tips'}${suffix}");
1669         }
1670     }
1671 }
1672
1673
1674 # output NMZ.result.*
1675 sub write_result_file () {
1676     my $fname = "NMZ.result.normal";
1677
1678     my @files = glob "$TEMPLATEDIR/NMZ.result.*";
1679
1680     for my $file (@files) {
1681         $file =~ m!(NMZ\.result\.[^/]*)$!;
1682         my $target = "$var::OUTPUT_DIR/$1";
1683         if (-f $target) {  # already exist;
1684             next;
1685         } else {
1686             my $buf = util::readfile($file);
1687             my $fh_file = util::efopen(">$target");
1688             print $fh_file $buf;
1689             util::fclose($fh_file);
1690         }
1691     }
1692 }
1693
1694 # write NMZ.body and etc.
1695 sub write_message ($) {
1696     my ($msgfile) = @_;
1697
1698     if (! -f $msgfile) {
1699         my ($template, $fname);
1700
1701         $msgfile =~ m!.*/(.*)$!;
1702         $fname = $1;
1703         $template = "$TEMPLATEDIR/$fname";
1704
1705         if (-f $template) {
1706             my $buf = util::readfile($template);
1707             my $fh_output = util::efopen(">$msgfile");
1708             print $fh_output $buf;
1709             util::fclose($fh_output);
1710         }
1711     }
1712 }
1713
1714
1715 #
1716 # Make the NMZ.slog file for logging.
1717 #
1718 sub make_slog_file () {
1719     if (! -f $var::NMZ{'slog'}) {
1720         my $fh = util::efopen(">$var::NMZ{'slog'}");
1721         util::fclose($fh);
1722         undef $fh;
1723         chmod 0666, $var::NMZ{'slog'};
1724     }
1725     {
1726         my $fh_slogfile = util::efopen(">>$var::NMZ{'slog'}");
1727         util::fclose($fh_slogfile);
1728     }
1729 }
1730
1731
1732 #
1733 # Concatenate $CURRENTDIR to the head of each file.
1734 #
1735 sub change_filenames ($) {
1736     my $dir = $var::OUTPUT_DIR;
1737
1738     for my $key (sort keys %var::NMZ) {
1739         next if $key =~ /^_/;    # exclude temporary file
1740         $var::NMZ{$key} = "$dir/$var::NMZ{$key}";
1741     }
1742
1743     # temporary files
1744     for my $key (sort keys %var::NMZ) {
1745         if ($key =~ /^_/) {
1746             $var::NMZ{$key} = util::tmpnam($var::NMZ{$key});
1747         }
1748     }
1749
1750     if ($var::Opt{'debug'}) {
1751         for my $key (sort keys %var::NMZ) {
1752             util::dprint("NMZ: $var::NMZ{$key}\n");
1753         }
1754     }
1755 }
1756
1757
1758 #
1759 # Preparation processing for appending index files.
1760 #
1761 sub append_index (@) {
1762     my @flist = @_;
1763
1764     my $docid_base = 0;
1765     ($docid_base, @flist) = set_target_files(@flist);
1766
1767     unless (@flist) {   # if @flist is empty
1768         if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
1769             set_lockfile($var::NMZ{'lock2'});
1770             update_dateindex();
1771             update_registry(0);
1772             make_headfoot_pages(0, get_total_keys());
1773             put_log(0, 0, 0, get_total_keys());
1774             make_headfoot_pages(get_status("files"), get_status("keys"));
1775             util::remove_tmpfiles();
1776         }
1777         print _("No files to index.\n");
1778         exit 0;
1779     }
1780
1781     $APPENDMODE = 1;
1782     # conserve files by copying
1783     copy($var::NMZ{'i'},  $var::NMZ{'_i'});
1784     copy($var::NMZ{'w'},  $var::NMZ{'_w'});
1785     copy($var::NMZ{'t'},  $var::NMZ{'_t'})
1786         unless -f $var::NMZ{'_t'}; # preupdated ?
1787     copy($var::NMZ{'p'},  $var::NMZ{'_p'});
1788     copy($var::NMZ{'pi'}, $var::NMZ{'_pi'});
1789
1790     return ($docid_base, @flist);
1791 }
1792
1793 #
1794 # Set target files to @flist and return with the number of regiested files.
1795 #
1796 sub set_target_files() {
1797     my %rdocs;    # 'rdocs' means 'registered documents'
1798     my @found_files = @_;
1799
1800     # Load the list of registered documents
1801     $rdocs{'name'} = load_registry();
1802
1803     # Pick up overlapped documents and do marking
1804     my %mark1;
1805     my @overlapped_files;
1806     grep {$_ !~ /^\# / && $mark1{$_}++ } @{$rdocs{'name'}};
1807     $rdocs{'overlapped'} = {}; # Prepare an anonymous hash.
1808     for my $overlapped (grep { $mark1{$_} } @found_files) {
1809         $rdocs{'overlapped'}{$overlapped} = 1;
1810         push @overlapped_files, $overlapped;
1811     };
1812
1813     # Pick up not overlapped documents which are files to index.
1814     my @flist = grep { ! $mark1{$_} } @found_files;
1815
1816     if ($var::Opt{'noupdate'}) {
1817         return (scalar @{$rdocs{'name'}}, @flist);
1818     };
1819
1820     # Load the date index.
1821     $rdocs{'mtime'} = load_dateindex();
1822
1823     if (@{$rdocs{'mtime'}} == 0) {
1824         return (scalar @{$rdocs{'name'}}, @flist);
1825     };
1826
1827     util::assert(@{$rdocs{'name'}} == @{$rdocs{'mtime'}},
1828                  "NMZ.r ($#{$rdocs{'name'}}) and NMZ.t ($#{$rdocs{'mtime'}}) are not consistent!");
1829
1830     # Pick up deleted documents and do marking
1831     # (registered in the NMZ.r but not existent in the filesystem)
1832     my @deleted_documents;
1833     unless ($var::Opt{'nodelete'}) {
1834         my %mark2;
1835         grep { $mark2{$_}++ } @found_files;
1836         for my $deleted (grep { $_ !~ /^\# / && ! $mark2{$_} &&
1837                                 ! $rdocs{'overlapped'}{$_} }
1838                          @{$rdocs{'name'}})
1839         {
1840             $rdocs{'deleted'}{$deleted} = 1;
1841             push @deleted_documents, $deleted;
1842         }
1843     }
1844
1845     # check filesize
1846     if ($var::Opt{'checkfilesize'}) {
1847         $rdocs{'size'} = load_sizefield();
1848     }
1849
1850     # Pick up updated documents and set the missing number for deleted files.
1851     my @updated_documents = pickup_updated_documents(\%rdocs);
1852
1853     # Append updated files to the list of files to index.
1854     if (@updated_documents) {
1855         push @flist, @updated_documents;
1856     }
1857
1858     # Remove duplicates.
1859     my %seen = ();
1860     @flist = grep { ! $seen{$_}++ } @flist;
1861
1862     util::dprint(_("\n\n== found files ==\n"), join("\n", @found_files), "\n");
1863     util::dprint(_("\n\n== registered documents ==\n"), join("\n", @{$rdocs{'name'}}), "\n");
1864     util::dprint(_("\n\n== overlapped documents ==\n"), join("\n", @overlapped_files), "\n");
1865     util::dprint(_("\n\n== deleted documents ==\n"), join("\n", @deleted_documents), "\n");
1866     util::dprint(_("\n\n== updated documents ==\n"), join("\n", @updated_documents), "\n");
1867     util::dprint(_("\n\n== files to index ==\n"), join("\n", @flist), "\n");
1868
1869     # Update NMZ.t with the missing number infomation and
1870     # append updated files and deleted files to NMZ.r with leading '# '
1871     if (@updated_documents || @deleted_documents) {
1872         $DeletedFilesCount = 0;
1873         $UpdatedFilesCount = 0;
1874         $UpdatedFilesCount += @updated_documents;
1875 #       $DeletedFilesCount += @updated_documents;
1876         $DeletedFilesCount += @deleted_documents;
1877         preupdate_dateindex(@{$rdocs{'mtime'}});
1878         preupdate_registry(@updated_documents, @deleted_documents);
1879     }
1880
1881     # Return the number of registered documents and list of files to index.
1882     return (scalar @{$rdocs{'name'}}, @flist);
1883 }
1884
1885 sub preupdate_registry(@) {
1886     my (@list) = @_;
1887
1888     my $fh_registry = util::efopen(">$var::NMZ{'_r'}");
1889     @list = grep { s/(.*)/\# $1\n/ } @list;
1890     print $fh_registry @list;
1891     print $fh_registry &_("## deleted: ") . util::rfc822time(time()) . "\n\n";
1892     util::fclose($fh_registry);
1893 }
1894
1895 sub preupdate_dateindex(@) {
1896     my @mtimes = @_;
1897
1898     # Since rewriting the entire file, it is not efficient,
1899     # but simple and reliable. this would be revised in the future.
1900     my $fh_dateindex = util::efopen(">$var::NMZ{'_t'}");
1901 #    print "\nupdate_dateindex\n", join("\n", @mtimes), "\n\n";
1902     print $fh_dateindex pack("N*", @mtimes);
1903     util::fclose($fh_dateindex);
1904 }
1905
1906 sub update_registry ($) {
1907     my ($docid_count) = @_;
1908
1909     {
1910         my $fh_registry = util::efopen(">>$var::NMZ{'r'}");
1911         my $fh_registry_ = util::efopen($var::NMZ{'_r'});
1912         while (defined(my $line = <$fh_registry_>)) {
1913             print $fh_registry $line;
1914         }
1915         if ($docid_count > 0) {
1916             print $fh_registry &_("## indexed: ") . util::rfc822time(time()) . "\n\n";
1917         }
1918         util::fclose($fh_registry_) if (defined $fh_registry_);
1919         util::fclose($fh_registry);
1920     }
1921     unlink $var::NMZ{'_r'};
1922 }
1923
1924 sub update_dateindex () {
1925     util::Rename($var::NMZ{'_t'}, $var::NMZ{'t'});
1926 }
1927
1928 sub update_field_index () {
1929     my @list = glob "$var::NMZ{'field'}.*.tmp";
1930     for my $tmp (@list) {
1931         if ($tmp =~ m!((^.*/NMZ\.field\..+?(?:\.i)?)\.tmp$)!) {
1932             my $fname_tmp = $1;
1933             my $fname_out = $2;
1934             {
1935                 my $fh_field = util::efopen(">>$fname_out");
1936                 my $fh_tmp = util::efopen($fname_tmp);
1937
1938                 while (defined(my $line = <$fh_tmp>)) {
1939                     print $fh_field $line;
1940                 }
1941                 util::fclose($fh_tmp) if (defined $fh_tmp);
1942                 util::fclose($fh_field);
1943             }
1944             unlink $fname_tmp;
1945         } else {
1946             util::cdie(_("update_field_index: ")."@list");
1947         }
1948     }
1949 }
1950
1951 sub pickup_updated_documents (\%) {
1952     my ($rdocs_ref) = @_;
1953     my @updated_documents = ();
1954
1955     # To avoid duplicated outputs caused by --html-split support.
1956     my %printed = ();
1957     my $i = 0;
1958     for my $cfile (@{$rdocs_ref->{'name'}}) {
1959         if (defined($rdocs_ref->{'deleted'}{$cfile})) {
1960             unless ($printed{$cfile}) {
1961                 print "$cfile " . _("was deleted!\n");
1962                 $printed{$cfile} = 1;
1963             }
1964             $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
1965         } elsif (defined($rdocs_ref->{'overlapped'}{$cfile})) {
1966             my $cfile_mtime = (stat($cfile))[9];
1967             my $rfile_mtime = $rdocs_ref->{'mtime'}[$i];
1968             my ($cfile_size, $rfile_size);
1969             if ($var::Opt{'checkfilesize'}) {
1970                 $cfile_size = (stat($cfile))[7];
1971                 $rfile_size = $rdocs_ref->{'size'}[$i];
1972             }
1973
1974             if ($rfile_mtime != $cfile_mtime ||
1975                 ($var::Opt{'checkfilesize'} && ($cfile_size != $rfile_size))) {
1976                 # The file is updated!
1977                 unless ($printed{$cfile}) {
1978                     print "$cfile " . _("was updated!\n");
1979                     $printed{$cfile} = 1;
1980                 }
1981                 push(@updated_documents, $cfile);
1982                 $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
1983             }
1984         }
1985         $i++;
1986     }
1987
1988     return @updated_documents
1989 }
1990
1991 sub load_dateindex() {
1992     my $fh_dateindex = util::efopen($var::NMZ{'t'});
1993
1994     my $size = -s $var::NMZ{'t'};
1995     my $buf  = "";
1996     read($fh_dateindex, $buf, $size);
1997     my @list = unpack("N*", $buf);  # load date index
1998 #    print "\nload_dateindex\n", join("\n", @list), "\n\n";
1999
2000     util::fclose($fh_dateindex);
2001     return [ @list ];
2002 }
2003
2004 sub load_registry () {
2005     my $fh_registry = util::efopen($var::NMZ{'r'});
2006
2007     my @list = ();
2008     my %deleted    = ();
2009     my @registered = ();
2010
2011     while (defined(my $line = <$fh_registry>)) {
2012         chomp($line);
2013         next if $line =~ /^\s*$/;  # an empty line
2014         next if $line =~ /^##/;    # a comment
2015         if ($line =~ s/^\#\s+//) { # deleted document
2016             $deleted{$line}++;
2017         } else {
2018             # Remove HTML's anchor generated by --html-split option.
2019             $line =~ s/\t.*$//g;
2020             push @registered, $line;
2021         }
2022     }
2023
2024     util::fclose($fh_registry) if (defined $fh_registry);
2025
2026     # Exclude deleted documents.
2027     for my $doc (@registered) {
2028         if ($deleted{$doc}) {
2029             push @list, "# $doc";
2030             $deleted{$doc}--;
2031         } else {
2032             push @list, $doc;
2033         }
2034     }
2035
2036     return [ @list ];
2037 }
2038
2039 # get file size information from NMZ.field.size
2040 sub load_sizefield() {
2041     my $fh_sizefield = util::efopen($var::NMZ{'field'} . '.size');
2042     return [] unless defined $fh_sizefield;
2043     my $line;
2044     my @ret = ();
2045     while (defined($line = <$fh_sizefield>)) {
2046         chomp $line;
2047         push @ret, $line;
2048     }
2049     util::fclose($fh_sizefield) if (defined $fh_sizefield);
2050     return \@ret;
2051 }
2052
2053 sub get_total_keys() {
2054     my $keys = get_status("keys");
2055     $keys =~ s/,//g if (defined $keys);
2056     $keys = 0 unless defined $keys;
2057     return $keys;
2058 }
2059
2060 sub get_total_files() {
2061     my $files = get_status("files");
2062     $files =~ s/,//g if (defined $files);
2063     $files = 0 unless defined $files;
2064     return $files;
2065 }
2066
2067 sub get_status($) {
2068     my ($key) = @_;
2069
2070     my $fh = util::fopen($var::NMZ{'status'});
2071     return undef unless defined $fh;
2072
2073     while (defined(my $line = <$fh>)) {
2074         if ($line =~ /^$key\s+(.*)$/) {
2075             util::dprint("status: $key = $1\n");
2076             $fh->close;
2077             return $1;
2078         }
2079     }
2080     util::fclose($fh) if (defined $fh);
2081     return undef;
2082 }
2083
2084 sub put_total_files($) {
2085     my ($number) = @_;
2086     $number =~ tr/,//d;
2087     put_status("files", $number);
2088 }
2089
2090 sub put_total_keys($) {
2091     my ($number) = @_;
2092     $number =~ tr/,//d;
2093     put_status("keys", $number);
2094 }
2095
2096 sub put_status($$) {
2097     my ($key, $value) = @_;
2098
2099     # remove NMZ.status file if the file has a previous value.
2100     unlink $var::NMZ{'status'} if defined get_status($key);
2101
2102     my $fh = util::efopen(">> $var::NMZ{'status'}");
2103     print $fh "$key $value\n";
2104     util::fclose($fh);
2105 }
2106
2107 # do logging
2108 sub put_log ($$$$) {
2109     my ($total_files_size, $start_time, $docid_count, $total_keys_count) = @_;
2110
2111     my $date = localtime;
2112     my $added_files_count   = $docid_count;
2113     my $deleted_documents_count = $DeletedFilesCount;
2114     my $updated_documents_count = $UpdatedFilesCount;
2115     my $total_files_count   = get_total_files() + $docid_count
2116                               - $DeletedFilesCount - $UpdatedFilesCount;
2117     my $added_keys_count    = 0;
2118     $added_keys_count       = $total_keys_count - get_total_keys();
2119
2120     my $processtime         = time - $start_time;
2121     $processtime            = 0 if $start_time == 0;
2122     $total_files_size       = $total_files_size;
2123     $total_keys_count       = $total_keys_count;
2124
2125     my @logmsgs = ();
2126     if ($APPENDMODE) {
2127         push @logmsgs, N_("[Append]");
2128     } else {
2129         push @logmsgs, N_("[Base]");
2130     }
2131     push @logmsgs, N_("Date:"), "$date" if $date;
2132     push @logmsgs, N_("Added Documents:"), util::commas("$added_files_count")
2133         if $added_files_count;
2134     push @logmsgs, N_("Deleted Documents:"),
2135         util::commas("$deleted_documents_count") if $deleted_documents_count;
2136     push @logmsgs, N_("Updated Documents:"),
2137         util::commas("$updated_documents_count") if $updated_documents_count;
2138     push @logmsgs, N_("Size (bytes):"), util::commas("$total_files_size")
2139         if $total_files_size;
2140     push @logmsgs, N_("Total Documents:"), util::commas("$total_files_count")
2141         if $total_files_count;
2142     push @logmsgs, N_("Added Keywords:"), util::commas("$added_keys_count")
2143         if $added_keys_count;
2144     push @logmsgs, N_("Total Keywords:"), util::commas("$total_keys_count")
2145         if $total_keys_count;
2146     push @logmsgs, N_("Wakati:"), "$conf::WAKATI" if $conf::WAKATI;
2147     push @logmsgs, N_("Time (sec):"), util::commas("$processtime")
2148         if $processtime;
2149     push @logmsgs, N_("File/Sec:"),  sprintf "%.2f",
2150         (($added_files_count + $updated_documents_count) / $processtime)
2151         if $processtime;
2152     push @logmsgs, N_("System:"), "$English::OSNAME" if $English::OSNAME;
2153     push @logmsgs, N_("Perl:"),   sprintf("%f", $English::PERL_VERSION);
2154     push @logmsgs, N_("Namazu:"), "$var::VERSION" if $var::VERSION;
2155
2156     my $log_for_file = "";
2157
2158     my $msg = shift @logmsgs;   # [Base] or [Append]
2159     # To stdout, use gettext.
2160     print _($msg), "\n";
2161     # To log file, do not use gettext.
2162     $log_for_file = $msg . "\n";
2163     while (@logmsgs) {
2164         my $field = shift @logmsgs;
2165         my $value = shift @logmsgs;
2166         printf "%-20s %s\n", _($field), "$value";
2167         $log_for_file .= sprintf "%-20s %s\n", $field, "$value";
2168     }
2169     print "\n";
2170     $log_for_file .= "\n";
2171
2172     put_log_to_logfile($log_for_file);
2173     put_total_files($total_files_count);
2174     put_total_keys($total_keys_count);
2175
2176     my $argv = join "\t", @ARGV;
2177     my $cwd  = cwd();
2178     put_status("argv", $argv);
2179     put_status("cwd",  $cwd);
2180 }
2181
2182 sub put_log_to_logfile ($) {
2183     my ($logmsg) = @_;
2184     my $fh_logfile = util::efopen(">>$var::NMZ{'log'}");
2185     print $fh_logfile $logmsg;
2186     util::fclose($fh_logfile);
2187 }
2188
2189 sub get_year() {
2190     my $year = (localtime)[5] + 1900;
2191
2192     return $year;
2193 }
2194
2195 # Compose NMZ.head and NMZ.foot. Prepare samples if necessary.
2196 # Insert $docid_count, $key_count, and $month/$day/$year respectively.
2197 sub make_headfoot ($$$) {
2198     my ($file, $docid_count, $key_count) = @_;
2199
2200     my $day   = sprintf("%02d", (localtime)[3]);
2201     my $month = sprintf("%02d", (localtime)[4] + 1);
2202     my $year  = get_year();
2203     my $buf   = "";
2204
2205     if (-f $file) {
2206         $buf = util::readfile($file);
2207     } else {
2208         $file =~ m!.*/(.*)$!;
2209         my $fname = $1;
2210         my $template = "$TEMPLATEDIR/$fname";
2211
2212         if (-f $template) {
2213             $buf = util::readfile($template);
2214         } else {
2215             return;
2216         }
2217     }
2218
2219     my $fh_file = util::efopen(">$file");
2220
2221     if ($buf =~ /(<!-- FILE -->)\s*(.*)\s*(<!-- FILE -->)/) {
2222         my $total_files_count = util::commas(get_total_files() + $docid_count
2223                                    - $DeletedFilesCount - $UpdatedFilesCount);
2224         $buf =~ s/(<!-- FILE -->)(.*)(<!-- FILE -->)/$1 $total_files_count $3/;
2225
2226     }
2227     if ($buf =~ /(<!-- KEY -->)\s*(.*)\s*(<!-- KEY -->)/) {
2228         my $tmp = $2;
2229         $tmp =~ tr/,//d;
2230         $tmp = $key_count;
2231         $tmp = util::commas($tmp);
2232         $buf =~ s/(<!-- KEY -->)(.*)(<!-- KEY -->)/$1 $tmp $3/;
2233     }
2234     my $index_dir = basename($var::OUTPUT_DIR);
2235     $buf =~ s#<!-- INDEX_DIR -->#$index_dir#gs;
2236     $buf =~ s#(<!-- DATE -->)(.*)(<!-- DATE -->)#$1 $year-$month-$day $3#gs;
2237     $buf =~ s/(<!-- VERSION -->)(.*)(<!-- VERSION -->)/$1 v$var::VERSION $3/gs;
2238     $buf =~ s{(<!-- ADDRESS -->)(.*)(<!-- ADDRESS -->)}
2239              {$1\n<a href="mailto:$conf::ADDRESS">$conf::ADDRESS</a>\n$3}gs;
2240     $buf =~ s{(<!-- LINK-REV-MADE -->)(.*)(<!-- LINK-REV-MADE -->)}
2241              {$1\n<link rev="made" href="mailto:$conf::ADDRESS">\n$3}gs;
2242
2243     print $fh_file $buf;
2244     util::fclose($fh_file);
2245 }
2246
2247 # Make phrase hashes for NMZ.p
2248 # Handle two words each for calculating a hash value ranged 0-65535.
2249 sub make_phrase_hash ($$$) {
2250     my ($docid_count, $docid_base, $contref) = @_;
2251
2252     my %tmp = ();
2253     $$contref =~ s!\x7f */? *\d+ *\x7f!!g;  # remove tags of weight
2254     $$contref =~ tr/\xa1-\xfea-z0-9 \n//cd; # remove all symbols
2255     my @words = split(/\s+/, $$contref);
2256     @words = grep {$_ ne ""} @words;   # remove empty words
2257     my $word_b = shift @words;
2258     my $docid = $docid_count + $docid_base;
2259     for my $word (@words) {
2260         next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
2261         my $hash = hash($word_b . $word);
2262         unless (defined $tmp{$hash}) {
2263             $tmp{$hash} = 1;
2264             $PhraseHashLast{$hash} = 0 unless defined $PhraseHashLast{$hash};
2265             $PhraseHash{$hash} .= pack("w", $docid - $PhraseHashLast{$hash});
2266 #           util::dprint("<$word_b, $word> $hash\n");
2267             $PhraseHashLast{$hash} = $docid;
2268         }
2269         $word_b = $word;
2270     }
2271 }
2272
2273 # Construct NMZ.p and NMZ.pi file. this processing is rather complex.
2274 sub write_phrase_hash () {
2275     write_phrase_hash_sub();
2276     util::Rename($var::NMZ{'__p'}, $var::NMZ{'_p'});
2277     util::Rename($var::NMZ{'__pi'}, $var::NMZ{'_pi'});
2278 }
2279
2280 sub write_phrase_hash_sub () {
2281     my $opened = 0;
2282
2283     return 0 if %PhraseHash eq '';    # namazu-devel-ja #3146
2284     util::dprint(_("doing write_phrase_hash() processing.\n"));
2285
2286     my $fh_tmp_pi = util::efopen(">$var::NMZ{'__pi'}");
2287     my $fh_tmp_p  = util::efopen(">$var::NMZ{'__p'}");
2288
2289     my $fh_phrase = util::fopen($var::NMZ{'_p'});
2290     my $fh_phraseindex = undef;
2291     if ($fh_phrase) {
2292         $fh_phraseindex = util::efopen($var::NMZ{'_pi'});
2293         $opened = 1;
2294     }
2295
2296     my $ptr = 0;
2297     for (my $i = 0; $i < 65536; $i++) {
2298
2299         my $baserecord = "";
2300         my $baseleng = 0;
2301
2302         if ($opened) {
2303             my $tmp = 0;
2304             read($fh_phraseindex, $tmp, $var::INTSIZE);
2305             $tmp = unpack("N", $tmp);
2306             if ($tmp != 0xffffffff) { # 0xffffffff
2307                 $baseleng = readw($fh_phrase);
2308                 read($fh_phrase, $baserecord, $baseleng);
2309             }
2310         }
2311         if (defined($PhraseHash{$i})) {
2312             if ($baserecord eq "") {
2313                 print $fh_tmp_pi pack("N", $ptr);
2314                 my $record = $PhraseHash{$i};
2315                 my $n2 = length($record);
2316                 my $data = pack("w", $n2) . $record;
2317                 print $fh_tmp_p $data;
2318                 $ptr += length($data);
2319             } else {
2320                 print $fh_tmp_pi pack("N", $ptr);
2321                 my $record = $PhraseHash{$i};
2322                 my $last_docid = get_last_docid($baserecord, 1);
2323                 my $adjrecord = adjust_first_docid($record, $last_docid);
2324                 check_records(\$record, \$baserecord, 1) unless defined $adjrecord; # namazu-bugs-ja#31
2325                 $record = $adjrecord;
2326                 my $n2 = length($record) + $baseleng;
2327                 my $data = pack("w", $n2) .  $baserecord . $record;
2328                 print $fh_tmp_p $data;
2329                 $ptr += length($data);
2330             }
2331         } else {
2332             if ($baserecord eq "") {
2333                 # if $baserecord has no data, set to 0xffffffff
2334                 print $fh_tmp_pi pack("N", 0xffffffff);
2335             } else {
2336                 print $fh_tmp_pi pack("N", $ptr);
2337                 my $data = pack("w", $baseleng) . $baserecord;
2338                 print $fh_tmp_p $data;
2339                 $ptr += length($data);
2340             }
2341         }
2342     }
2343
2344     if ($opened) {
2345         util::fclose($fh_phraseindex);
2346     }
2347     if (defined $fh_phrase) {
2348         util::fclose($fh_phrase);
2349     }
2350     util::fclose($fh_tmp_p);
2351     util::fclose($fh_tmp_pi);
2352
2353     %PhraseHash = ();
2354     %PhraseHashLast = ();
2355 }
2356
2357 # Dr. Knuth's  ``hash'' from (UNIX MAGAZINE May 1998)
2358 sub hash ($) {
2359     my ($word) = @_;
2360
2361     my $hash = 0;
2362     for (my $i = 0; $word ne ""; $i++) {
2363         $hash ^= $Seed[$i & 0x03][ord($word)];
2364         $word = substr $word, 1;
2365         # $word =~ s/^.//;  is slower
2366     }
2367     return $hash & 65535;
2368 }
2369
2370 # Count frequencies of words.
2371 sub count_words ($$$$) {
2372     my ($docid_count, $docid_base, $contref, $kanji) = @_;
2373     my (@tmp);
2374
2375     # Normalize into small letter.
2376     $$contref =~ tr/A-Z/a-z/;
2377
2378     # Remove control char.
2379     $$contref =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a/ /;
2380
2381     # It corresponds to -j option of ChaSen.
2382     $$contref =~ s/^[ \t\f]+//gm;    # except "\r\n"
2383     $$contref =~ s/[ \t\f]+$//gm;    # except "\r\n"
2384     $$contref =~ s/([a-z])-\n([a-z])/$1$2/gsi;   # for hyphenation
2385     if (util::islang("ja")) {
2386         $$contref =~ s/([\x80-\xff])\n([\x80-\xff])/$1$2/gs;
2387         $$contref =~ s/(¡£|¡¢)/$1\n/gs;
2388     }
2389     $$contref =~ s/\n+/\n/gs;
2390
2391     # Do wakatigaki if necessary.
2392     if (util::islang("ja")) {
2393         wakati::wakatize_japanese($contref) if $kanji;
2394     }
2395
2396     my $part1 = "";
2397     my $part2 = "";
2398     if ($$contref =~ /\x7f/) {
2399         $part1 = substr $$contref, 0, index($$contref, "\x7f");
2400         $part2 = substr $$contref, index($$contref, "\x7f");
2401 #       $part1 = $PREMATCH;  # $& and friends are not efficient
2402 #       $part2 = $MATCH . $POSTMATCH;
2403     } else {
2404         $part1 = $$contref;
2405         $part2 = "";
2406     }
2407
2408     # do scoring
2409     my %word_count = ();
2410     $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
2411         wordcount_sub($2, $1, \%word_count)!ge;
2412     wordcount_sub($part1, 1, \%word_count);
2413
2414     # Add them to whole index
2415     my $docid = $docid_count + $docid_base;
2416     for my $word (keys(%word_count)) {
2417         next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
2418         $KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
2419         $KeyIndex{$word} .= pack("w2",
2420                                  $docid - $KeyIndexLast{$word},
2421                                  $word_count{$word});
2422         $KeyIndexLast{$word} = $docid;
2423     }
2424 }
2425
2426 #
2427 # Count words and do score weighting
2428 #
2429 sub wordcount_sub ($$\%) {
2430     my ($text, $weight, $word_count) = @_;
2431
2432     # Remove all symbols when -K option is specified.
2433     $text =~ tr/\xa1-\xfea-z0-9/   /c if $var::Opt{'nosymbol'};
2434
2435     # Count frequencies of words in a current document.
2436     # Handle symbols as follows.
2437     #
2438     # tcp/ip      ->  tcp/ip,     tcp,      ip
2439     # (tcp/ip)    ->  (tcp/ip),   tcp/ip,   tcp, ip
2440     # ((tcpi/ip)) ->  ((tcp/ip)), (tcp/ip), tcp
2441     #
2442     # Don't do processing for nested symbols.
2443     # NOTE: When -K is specified, all symbols are already removed.
2444
2445     my @words = split /\s+/, $text;
2446     for my $word (@words) {
2447         next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
2448         if ($var::Opt{'noedgesymbol'}) {
2449             # remove symbols at both ends
2450             $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
2451         }
2452         $word_count->{$word} = 0 unless defined($word_count->{$word});
2453         $word_count->{$word} += $weight;
2454         unless ($var::Opt{'nosymbol'}) {
2455             if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
2456                 $word_count->{$1} = 0 unless defined($word_count->{$1});
2457                 $word_count->{$1} += $weight;
2458                 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2459             } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
2460                 $word_count->{$1} = 0 unless defined($word_count->{$1});
2461                 $word_count->{$1} += $weight;
2462                 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2463             } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
2464                 $word_count->{$1} = 0 unless defined($word_count->{$1});
2465                 $word_count->{$1} += $weight;
2466                 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2467             }
2468             my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
2469                 if $word =~ /[^\xa1-\xfea-z_0-9]/;
2470             for my $tmp (@words_) {
2471                 next if $tmp eq "";
2472                 $word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
2473                 $word_count->{$tmp} += $weight;
2474             }
2475             @words_ = ();
2476         }
2477     }
2478     return "";
2479 }
2480
2481 # Construct NMZ.i and NMZ.ii file. this processing is rather complex.
2482 sub write_index () {
2483     my $key_count = write_index_sub();
2484     util::Rename($var::NMZ{'__i'}, $var::NMZ{'_i'});
2485     util::Rename($var::NMZ{'__w'}, $var::NMZ{'_w'});
2486
2487     return $key_count;
2488 }
2489
2490 # readw: read one pack 'w' word.
2491 # This code was contributed by <furukawa@tcp-ip.or.jp>.
2492 sub readw ($) {
2493     my $fh = shift;
2494     my $ret = '';
2495     my $c;
2496
2497     while (read($fh, $c, 1)){
2498         $ret .= $c;
2499         last unless 0x80 & ord $c;
2500     }
2501     return unpack('w', $ret);
2502 }
2503
2504 sub get_last_docid ($$) {
2505     my ($record, $step) = @_;
2506     my (@data) = unpack 'w*', $record;
2507
2508     my $sum = 0;
2509     for (my $i = 0; $i < @data; $i += $step) {
2510         $sum += $data[$i];
2511     }
2512     my $leng = @data / $step;
2513     return $sum;
2514 }
2515
2516 sub adjust_first_docid ($$) {
2517     my ($record, $last_docid) = @_;
2518     my (@data) = unpack 'w*', $record;
2519
2520     $data[0] = $data[0] - $last_docid;
2521     return undef if ($data[0] < 0); # namazu-bug-ja#31
2522     $record = pack 'w*', @data;
2523     return $record;
2524 }
2525
2526 sub write_index_sub () {
2527     my @words = sort keys(%KeyIndex);
2528     return 0 if $#words == -1;
2529
2530     my $cnt = 0;
2531     my $ptr_i = 0;
2532     my $ptr_w = 0;
2533     my $key_count = 0;
2534     my $baserecord = "";
2535
2536     util::dprint(_("doing write_index() processing.\n"));
2537     my $fh_tmp_i  = util::efopen(">$var::NMZ{'__i'}");
2538     my $fh_tmp_w  = util::efopen(">$var::NMZ{'__w'}");
2539     my $fh_i      = util::fopen($var::NMZ{'_i'});
2540     my $fh_ii     = util::efopen(">$var::NMZ{'_ii'}");
2541     my $fh_w      = util::fopen($var::NMZ{'_w'});
2542     my $fh_wi = util::efopen(">$var::NMZ{'_wi'}");
2543
2544     if ($fh_w) {
2545       FOO:
2546         while (defined(my $line = <$fh_w>)) {
2547             chop $line;
2548             my $current_word = $line;
2549
2550             my $baseleng = readw($fh_i);
2551             read($fh_i, $baserecord, $baseleng);
2552
2553             for (; $cnt < @words; $cnt++) {
2554                 last unless $words[$cnt] le $current_word;
2555                 my $record = $KeyIndex{$words[$cnt]};
2556                 my $leng = length($record);
2557
2558                 if ($current_word eq $words[$cnt]) {
2559                     my $last_docid = get_last_docid($baserecord, 2);
2560                     my $adjrecord = adjust_first_docid($record, $last_docid);
2561                     check_records(\$record, \$baserecord, 2) unless defined $adjrecord; # namazu-bugs-ja#31
2562                     $record = $adjrecord;
2563                     $leng = length($record);  # re-measure
2564                     my $tmp = pack("w", $leng + $baseleng);
2565
2566                     my $data_i = "$tmp$baserecord$record";
2567                     my $data_w = "$current_word\n";
2568                     print $fh_tmp_i $data_i;
2569                     print $fh_tmp_w $data_w;
2570                     print $fh_ii pack("N", $ptr_i);
2571                     print $fh_wi pack("N", $ptr_w);
2572                     $ptr_i += length($data_i);
2573                     $ptr_w += length($data_w);
2574                     $key_count++;
2575
2576                     $cnt++;
2577                     next FOO;
2578                 } else {
2579                     my $tmp = pack("w", $leng);
2580                     my $data_i = "$tmp$record";
2581                     my $data_w = "$words[$cnt]\n";
2582                     print $fh_tmp_i $data_i;
2583                     print $fh_tmp_w $data_w;
2584                     print $fh_ii pack("N", $ptr_i);
2585                     print $fh_wi pack("N", $ptr_w);
2586                     $ptr_i += length($data_i);
2587                     $ptr_w += length($data_w);
2588                     $key_count++;
2589                 }
2590             }
2591             my $tmp  = pack("w", $baseleng);
2592             my $data_i = "$tmp$baserecord";
2593             my $data_w = "$current_word\n";
2594             print $fh_tmp_i $data_i;
2595             print $fh_tmp_w $data_w;
2596             print $fh_ii pack("N", $ptr_i);
2597             print $fh_wi pack("N", $ptr_w);
2598             $ptr_i += length($data_i);
2599             $ptr_w += length($data_w);
2600             $key_count++;
2601         }
2602     }
2603     while ($cnt < @words) {
2604         my $leng = length($KeyIndex{$words[$cnt]});
2605         my $tmp = pack("w", $leng);
2606         my $record = $KeyIndex{$words[$cnt]};
2607
2608         my $data_i = "$tmp$record";
2609         my $data_w = "$words[$cnt]\n";
2610         print $fh_tmp_i $data_i;
2611         print $fh_tmp_w $data_w;
2612         print $fh_ii pack("N", $ptr_i);
2613         print $fh_wi pack("N", $ptr_w);
2614         $ptr_i += length($data_i);
2615         $ptr_w += length($data_w);
2616         $key_count++;
2617         $cnt++;
2618     }
2619     %KeyIndex = ();
2620     %KeyIndexLast = ();
2621
2622     util::fclose($fh_wi);
2623     util::fclose($fh_w) if (defined $fh_w);
2624     util::fclose($fh_ii);
2625     util::fclose($fh_i) if (defined $fh_i);
2626     util::fclose($fh_tmp_w);
2627     util::fclose($fh_tmp_i);
2628
2629     return $key_count;
2630 }
2631
2632 #
2633 # Decide the media type.
2634 # FIXME: Very ad hoc. It's just a compromise. -- satoru
2635 #
2636 sub decide_type ($$) {
2637     my ($name, $cont) = @_;
2638     return $name if (!defined $cont || $name eq $cont);
2639
2640     util::dprint("decide_type: name: $name, cont: $cont\n");
2641     if ($cont =~ m!^text/plain! && $name =~ m!^text/plain!) {
2642         return $name;
2643     } elsif ($cont =~ m!^application/octet-stream! &&
2644              $name !~ m!^text/!) {
2645         return $name;
2646     } elsif ($cont =~ m!^application/(excel|powerpoint|msword)! &&
2647              $name !~ m!^application/octet-stream!)  {
2648         # FIXME: Currently File::MMagic 1.02's checktype_data()
2649         # is unreliable for them.
2650         return $name;
2651     } elsif ($cont =~ m!^application/x-zip! &&
2652              $name =~ m!^application/!) {
2653         # zip format is used other applications e.g. OpenOffice.
2654         # It is necessary to add to check extention.
2655         return $name;
2656     }
2657
2658     return $cont;
2659 }
2660
2661 #
2662 # Debugging code for the "negative numbers" problem.
2663 #
2664 sub check_records ($$$) {
2665     my ($recref, $baserecref, $step) = @_;
2666     dump_record($baserecref, $step);
2667     dump_record($recref, $step);
2668     print STDERR "The \x22negative number\x22 problem occurred.\n";
2669     exit(1);
2670 }
2671
2672 sub dump_record($$) {
2673     my ($recref, $step) = @_;
2674     my (@data) = unpack 'w*', $$recref;
2675     print STDERR "dump record data to NMZ.bug.info (step: $step)...";
2676     my $fh_info = util::fopen(">> NMZ.bug.info");
2677     print $fh_info "dumped record data (step: $step)...";
2678     foreach (@data) {
2679         print $fh_info sprintf(" %08x", $_);
2680     }
2681     print $fh_info "\n";
2682     util::fclose($fh_info);
2683     return;
2684 }
2685
2686 sub trapintr {
2687     my ($signame) = @_;
2688     print STDERR "Warning: signal $signame occured.\n";
2689 }
2690
2691 #
2692 # For avoiding "used only once: possible typo at ..." warnings.
2693 #
2694 muda($conf::ON_MEMORY_MAX,
2695      $conf::WORD_LENG_MAX, $conf::TEXT_SIZE_MAX,
2696      $conf::DENY_FILE, $var::INTSIZE,
2697      $conf::CHASEN_NOUN, $conf::CHASEN,
2698      $conf::KAKASI, $var::Opt{'okurigana'},
2699      $var::Opt{'hiragana'}, $conf::DIRECTORY_INDEX,
2700      $usage::USAGE, $var::Opt{'noheadabst'}, $usage::VERSION_INFO,
2701      $var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX,
2702      $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE,
2703      $conf::ADDRESS, $var::MAILING_ADDRESS,
2704      $conf::FILE_SIZE_MAX,
2705      $conf::MECAB,
2706      $conf::DENY_DDN,
2707      $var::TRAC_URI,
2708      );
2709
2710 sub muda {}
2711