--- /dev/null
+#! /usr/bin/perl -w
+# -*- Perl -*-
+# mknmz - indexer of Namazu
+# $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $
+#
+# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
+# Copyright (C) 2000-2008 Namazu Project All rights reserved.
+# This is free software with ABSOLUTELY NO WARRANTY.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA
+#
+# This file must be encoded in EUC-JP encoding
+#
+
+package mknmz;
+require 5.004;
+use English;
+use lib ".";
+use Cwd;
+use IO::File;
+use File::Find;
+use File::MMagic;
+use Time::Local;
+use strict; # be strict since v1.2.0
+use Getopt::Long;
+use File::Copy;
+use DirHandle;
+use File::Basename;
+
+use vars qw($SYSTEM);
+# It exists only for backward compatibility.
+$SYSTEM = $English::OSNAME;
+
+my $NAMAZU_INDEX_VERSION = "2.0";
+
+my $CodingSystem = "euc";
+my $PKGDATADIR = $ENV{'pkgdatadir'} || "/usr/share/namazu";
+my $CONFDIR = "/etc/namazu"; # directory where mknmzrc are in.
+my $LIBDIR = $PKGDATADIR . "/pl"; # directory where library etc. are in.
+my $FILTERDIR = $PKGDATADIR . "/filter"; # directory where filters are in.
+my $TEMPLATEDIR = $PKGDATADIR . "/template"; # directory where templates are in.
+
+my $DeletedFilesCount = 0;
+my $UpdatedFilesCount = 0;
+my $APPENDMODE = 0;
+my %PhraseHash = ();
+my %PhraseHashLast = ();
+my %KeyIndex = ();
+my %KeyIndexLast = ();
+my %CheckPoint = ("on" => undef, "continue" => undef);
+my $ConfigFile = undef;
+my $MediaType = undef;
+
+my $ReplaceCode = undef; # perl code for transforming URI
+my @Seed = ();
+my @LoadedRcfiles = ();
+my $Magic = new File::MMagic;
+
+my $ReceiveTERM = 0;
+
+STDOUT->autoflush(1);
+STDERR->autoflush(1);
+main();
+sub main {
+ my $start_time = time;
+
+ if ($English::PERL_VERSION == 5.008001) {
+ unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) {
+ print "Run mknmz with the environment variable PERL_HASH_SEED=0\n";
+ exit 1;
+ }
+ }
+
+ init();
+
+    # Load pl/conf.pl first to prevent some variables from being overridden.
+ preload_modules();
+
+ # set LANG and bind textdomain
+ util::set_lang();
+ textdomain('namazu', $util::LANG_MSG);
+
+ load_modules();
+ my ($output_dir, @targets) = parse_options();
+ my ($docid_base, $total_files_num) = prep($output_dir, @targets);
+
+ my $swap = 1;
+ my $docid_count = 0;
+ my $file_count = 0;
+ my $total_files_size = 0;
+ my $key_count = 0;
+ my $checkpoint = 0;
+ my $flist_ptr = 0;
+ my $processed_files_size = 0;
+
+ if ($CheckPoint{'continue'}) {
+ # Restore variables
+ eval util::readfile($var::NMZ{'_checkpoint'}) ;
+ } else {
+ print $total_files_num . _(" files are found to be indexed.\n");
+ }
+
+ {
+ my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
+ my $fh_flist = util::efopen($var::NMZ{'_flist'});
+ my %field_indices = ();
+ get_field_index_base(\%field_indices);
+
+ if ($CheckPoint{'continue'}) {
+ seek($fh_flist, $flist_ptr, 0);
+ }
+
+ # Process target files one by one
+ while (defined(my $line = <$fh_flist>)) {
+ $flist_ptr += length($line);
+ my $cfile = $line;
+ chomp $cfile;
+ util::dprint(_("target file: ")."$cfile\n");
+
+ my ($cfile_size, $num) =
+ process_file($cfile, $docid_count, $docid_base,
+ $file_count, \%field_indices,
+ $fh_errorsfile, $total_files_num);
+ if ($num == 0) {
+ $total_files_num--;
+ next;
+ } else {
+ $docid_count += $num;
+ $file_count++;
+ }
+
+ $total_files_size += $cfile_size;
+ $processed_files_size += $cfile_size;
+ last if $ReceiveTERM;
+ if ($processed_files_size > $conf::ON_MEMORY_MAX) {
+ if (%KeyIndex) {
+ $key_count = write_index();
+ print _("Writing index files...");
+ write_phrase_hash();
+ print "\n";
+ }
+ $processed_files_size = 0;
+ $checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
+ }
+ }
+
+ util::fclose($fh_flist);
+ util::fclose($fh_errorsfile);
+ }
+    # This must stay outside the block above so that the file handles are already closed.
+ re_exec($flist_ptr, $docid_count, $docid_base, $start_time,
+ $total_files_size, $total_files_num,
+ $file_count, $key_count) if $checkpoint;
+
+ if (%KeyIndex) {
+ $key_count = write_index();
+ print _("Writing index files...");
+ write_phrase_hash();
+ print "\n";
+ }
+
+ $key_count = get_total_keys() unless $key_count;
+ do_remain_job($total_files_size, $docid_count, $key_count,
+ $start_time);
+ exit 0;
+}
+
+#
+# FIXME: Very complicated.
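+# Index one target file: with --html-split, split HTML files into parts and
+# index each part; otherwise hand the whole file to namazu_core.
+# Returns (file size, number of documents indexed).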
+#
+sub process_file ($$$$\%$$) {
+ my ($cfile, $docid_count, $docid_base, $file_count,
+ $field_indices, $fh_errorsfile, $total_files_num) = @_;
+
+ my $processed_num = 0;
+ my $file_size = util::filesize($cfile);
+
+ if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
+ my @parts;
+ @parts = htmlsplit::split($cfile, "NMZ.partial")
+ if ($file_size <= $conf::FILE_SIZE_MAX);
+ if (@parts > 1) {
+ my $id = 0;
+ for my $part (@parts) {
+ next if (defined $conf::EXCLUDE_PATH &&
+ "$cfile#$part" =~ /$conf::EXCLUDE_PATH/);
+ my $fname = util::tmpnam("NMZ.partial.$id");
+ my $fragment = defined $part ? $part : undef;
+ my $uri = generate_uri($cfile, $fragment);
+ my $result = namazu_core($fname,
+ $docid_count + $processed_num,
+ $docid_base, $file_count,
+ $field_indices, $fh_errorsfile,
+ $total_files_num,
+ $uri, $id, $#parts);
+ if ($result > 0) {
+ $processed_num++;
+ my $rname = defined $part ? "$cfile\t$part" : "$cfile";
+ put_registry($rname);
+ }
+ unlink $fname;
+ $id++;
+ }
+ return ($file_size, $processed_num);
+ }
+ }
+ my $result = namazu_core($cfile, $docid_count, $docid_base,
+ $file_count, $field_indices,
+ $fh_errorsfile, $total_files_num,
+ undef, undef, undef);
+ if ($result > 0) {
+ $processed_num++;
+ put_registry($cfile);
+ }
+
+ return ($file_size, $processed_num);
+}
+
+#
+# Load mknmzrcs:
+#
+# 1. MKNMZRC environment
+#
+# 2. $(sysconfdir)/$(PACKAGE)/mknmzrc
+#
+# 3. ~/.mknmzrc
+#
+# 4. user-specified mknmzrc set by mknmz --config=file option.
+#
+# If multiple files exist, read all of them.
+#
+sub load_rcfiles () {
+ my (@cand) = ();
+
+    # To support Windows: because of its nasty drive letter convention,
+    # the mknmzrc location may need to be changed dynamically via an
+    # environment variable.
+ push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'};
+ push @cand, "$CONFDIR/mknmzrc";
+ push @cand, "$ENV{'HOME'}/.mknmzrc";
+
+ util::vprint(_("Reading rcfile: "));
+ for my $rcfile (@cand) {
+ if (-f $rcfile) {
+ load_rcfile ($rcfile);
+ util::vprint(" $rcfile");
+ }
+ }
+ util::vprint("\n");
+}
+
+sub load_rcfile ($) {
+ my ($rcfile) = @_;
+ if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
+ util::win32_yen_to_slash(\$rcfile);
+ }
+ return if (grep {m/^$rcfile$/} @LoadedRcfiles);
+ do $rcfile;
+ if ($@) {
+ chop $@;
+ push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'";
+ }else {
+ push @LoadedRcfiles, $rcfile;
+ }
+
+ # Dirty workaround.
+ $LIBDIR = $conf::LIBDIR
+ if (defined $conf::LIBDIR && -d $conf::LIBDIR);
+ $FILTERDIR = $conf::FILTERDIR
+ if (defined $conf::FILTERDIR && -d $conf::FILTERDIR);
+ $TEMPLATEDIR = $conf::TEMPLATEDIR
+ if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR);
+}
+
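+#
+# Save the indexing state to NMZ._checkpoint and re-exec mknmz with -S
+# so that indexing continues from the checkpoint.
+#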
+sub re_exec($$$$$$$$) {
+ my ($flist_ptr, $docid_count, $docid_base, $start_time,
+ $total_files_size, $total_files_num, $file_count, $key_count) = @_;
+
+ # store variables
+ {
+ my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}");
+
+ print $fh_checkpoint <<EOM;
+ \$DeletedFilesCount = $DeletedFilesCount;
+ \$UpdatedFilesCount = $UpdatedFilesCount;
+ \$APPENDMODE = $APPENDMODE;
+ \$flist_ptr = $flist_ptr;
+ \$docid_count = $docid_count;
+ \$docid_base = $docid_base;
+ \$start_time = $start_time;
+ \$total_files_size = $total_files_size;
+ \$total_files_num = $total_files_num;
+ \$key_count = $key_count;
+ \$file_count = $file_count;
+ \$\$ = $$;
+EOM
+ util::fclose($fh_checkpoint);
+ }
+
+ @ARGV = ("-S", @ARGV) ;
+ print _("Checkpoint reached: re-exec mknmz...\n");
+ util::dprint(join ' ', ("::::", @ARGV, "\n"));
+ exec ($0, @ARGV) ;
+}
+
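+#
+# Append the name of a processed file to the temporary registry (NMZ._r).
+#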
+sub put_registry ($) {
+ my ($filename) = @_;
+ my $fh_registry = util::efopen(">>$var::NMZ{'_r'}");
+ print $fh_registry $filename, "\n";
+ util::fclose($fh_registry);
+}
+
+
+# Initialization
+# $CodingSystem: Character Coding System 'euc' or 'sjis'
+sub init () {
+ if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
+ $CodingSystem = "sjis";
+ if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
+ $CONFDIR = $1 . $CONFDIR ;
+ }
+ if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
+ $LIBDIR = $1 . $LIBDIR ;
+ }
+ if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
+ $FILTERDIR = $1 . $FILTERDIR ;
+ }
+ if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
+ $TEMPLATEDIR = $1 . $TEMPLATEDIR ;
+ }
+ } else {
+ $CodingSystem = "euc";
+ }
+
+ $SIG{'INT'} = sub {
+ util::cdie("SIGINT caught! Aborted.\n");
+ };
+
+ $SIG{'TERM'} = sub {
+ print STDERR "SIGTERM caught!\n";
+ $ReceiveTERM = 1;
+ };
+}
+
+sub preload_modules () {
+ unshift @INC, $LIBDIR;
+ # workaround for test suites.
+ unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'};
+
+ require "var.pl" || die "unable to require \"var.pl\"\n";
+ require "conf.pl" || die "unable to require \"conf.pl\"\n";
+ require "util.pl" || die "unable to require \"util.pl\"\n";
+ require "gettext.pl" || die "unable to require \"gettext.pl\"\n";
+ require "ext.pl" || die "unable to require \"ext.pl\"\n";
+}
+
+sub postload_modules () {
+ require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n";
+}
+
+sub load_modules () {
+ require "usage.pl" || die "unable to require \"usage.pl\"\n";
+ require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n";
+ require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
+ require "seed.pl" || die "unable to require \"seed.pl\"\n";
+ require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";
+
+ @Seed = seed::init();
+}
+
+sub load_filtermodules () {
+ unshift @INC, $FILTERDIR;
+
+ #
+    # Windows modules must be loaded first because OLE filters
+    # have lower precedence than the normal ones.
+ #
+ load_win32modules() if $English::OSNAME eq "MSWin32";
+
+ # Check filter modules
+ my @filters = ();
+ @filters = glob "$FILTERDIR/*.pl";
+
+ load_filters(@filters);
+}
+
+sub load_win32modules () {
+ # Check filter modules
+ my @filters = ();
+ if (-f "../filter/win32/olemsword.pl") { # to ease developing
+ @filters = glob "../filter/win32/*.pl";
+ unshift @INC, "../filter/win32";
+ } else {
+ @filters = glob "$FILTERDIR/win32/*.pl";
+ unshift @INC, "$FILTERDIR/win32";
+ }
+
+ load_filters(@filters);
+}
+
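+#
+# Load each filter module and record its media types, status and
+# code conversion requirements in the %var:: tables.
+#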
+sub load_filters (@) {
+ my @filters = @_;
+
+ for my $filter (@filters) {
+ $filter =~ m!([-\w]+)\.pl$!;
+ my $module = $1;
+	require "$module.pl" || die "unable to require \"$module.pl\"\n";
+ my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv);
+
+ eval "\@mtypes = ${module}::mediatype();";
+ die $@ if $@; # eval error
+ eval "\$status = ${module}::status();";
+ die $@ if $@;
+ eval "\$recursive = ${module}::recursive();";
+ die $@ if $@;
+ eval "\$pre_codeconv = ${module}::pre_codeconv();";
+ die $@ if $@;
+ eval "\$post_codeconv = ${module}::post_codeconv();";
+ die $@ if $@;
+ eval "${module}::add_magic(\$Magic);";
+ die $@ if $@;
+
+ for my $mt (@mtypes) {
+ next if (defined $var::Supported{$mt} &&
+ $var::Supported{$mt} eq 'yes' && $status eq 'no');
+ $var::Supported{$mt} = $status;
+ $var::REQUIRE_ACTIONS{$mt} = $module;
+ $var::RECURSIVE_ACTIONS{$mt} = $recursive;
+ $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv;
+ $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv;
+ }
+ }
+}
+
+# Core routine.
+#
+# FIXME: Too many parameters. They should be cleaned up.
+#
+sub namazu_core ($$$$$$$$$$) {
+ my ($cfile, $docid_count, $docid_base,
+ $file_count, $field_indices, $fh_errorsfile, $total_files_num,
+ $uri, $part_id, $part_num) = @_;
+
+ my $headings = "";
+ my $content = "";
+ my $weighted_str = "";
+ my %fields;
+ my $msg_prefix;
+
+ if ($part_id) {
+ $msg_prefix = " $part_id/$part_num - ";
+ } else {
+ $msg_prefix = $file_count + 1 . "/$total_files_num - ";
+ }
+
+ unless ($uri) {
+ $uri = generate_uri($cfile); # Make a URI from a file name.
+ }
+ my ($cfile_size, $text_size, $kanji, $mtype) =
+ load_document(\$cfile, \$content, \$weighted_str,
+ \$headings, \%fields);
+
+ {
+ $fields{'mtime'} = (stat($cfile))[9];
+ my $utc = $fields{'mtime'};
+ $utc = time::rfc822time_to_mtime($fields{'date'})
+ if (defined $fields{'date'});
+ if ($utc == -1) {
+ my $date = $fields{'date'};
+	    print "$cfile Illegal date format: $date\n";
+	    print $fh_errorsfile "$cfile Illegal date format: $date\n";
+ $utc = $fields{'mtime'};
+ delete $fields{'date'};
+ }
+ $fields{'utc'} = $utc;
+ }
+
+ util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n");
+
+ # Check if the file is acceptable.
+ my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri);
+ if (defined $err) {
+ if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
+ my $uri2 = codeconv::eucjp_to_shiftjis($uri);
+ print $msg_prefix . "$uri2 $err\n";
+ } else {
+ print $msg_prefix . "$uri $err\n";
+ }
+ print $fh_errorsfile "$cfile $err\n";
+ return 0; # return 0 if error
+ }
+
+ # Print processing file name as URI.
+ if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
+ my $uri2 = codeconv::eucjp_to_shiftjis($uri);
+ print $msg_prefix . "$uri2 [$mtype]\n";
+ } else {
+ print $msg_prefix . "$uri [$mtype]\n";
+ }
+
+ # Add filename.
+ my $filename = defined $cfile ? $cfile : '';
+ codeconv::toeuc(\$filename);
+ $filename = basename($filename);
+ $fields{'filename'} = $filename;
+
+ complete_field_info(\%fields, $cfile, $uri,
+ \$headings, \$content, \$weighted_str);
+ put_field_index(\%fields, $field_indices);
+
+ put_dateindex($cfile);
+ $content .= "\n\n$filename\n\n"; # add filename
+ $content .= $weighted_str; # add weights
+ count_words($docid_count, $docid_base, \$content, $kanji);
+ make_phrase_hash($docid_count, $docid_base, \$content);
+
+ # assertion
+ util::assert($cfile_size != 0,
+ "cfile_size == 0 at the end of namazu_core.");
+
+ return $cfile_size;
+}
+
+#
+# Make the URI from the given file name.
+#
+sub generate_uri (@) {
+ my ($file, $fragment) = @_;
+ return "" unless defined $file;
+
+    # Omit the file name if it is a directory index file.
+ $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o;
+
+ if (defined $ReplaceCode) {
+ # transforming URI by evaling
+ $_ = $file;
+ eval $ReplaceCode;
+ $file = $_;
+ }
+
+ if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
+ $file =~ s#^([A-Z]):#/$1|#i; # converting a drive part like: /C|
+ }
+
+ if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
+ $file = codeconv::shiftjis_to_eucjp($file);
+ }
+ if (defined $fragment) {
+ codeconv::toeuc(\$fragment);
+ }
+
+ unless ($var::Opt{'noencodeuri'}) {
+ for my $tmp ($file, $fragment) {
+ next unless defined $tmp;
+
+ # Escape unsafe characters (not strict)
+ $tmp =~ s/\%/%25/g; # Convert original '%' into '%25' v1.1.1.2
+ $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/
+ sprintf("%%%02X",ord($1))/ge;
+ }
+ }
+
+
+ my $uri = $file;
+ $uri .= "#" . $fragment if defined $fragment;
+ if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
+ # restore '|' for drive letter rule of Win32, OS/2
+ $uri =~ s!^/([A-Z])%7C!/$1|!i;
+ }
+ return $uri;
+}
+
+
+sub get_field_index_base (\%) {
+ my ($field_indices) = @_;
+
+ my @keys = split('\|', $conf::SEARCH_FIELD);
+ if ($var::Opt{'meta'}) {
+ push @keys, (split '\|', $conf::META_TAGS);
+ }
+ for my $key (@keys) {
+ $key = lc($key);
+ my $fname = "$var::NMZ{'field'}.$key";
+ my $tmp_fname = util::tmpnam("NMZ.field.$key");
+ my $size = 0;
+ $size = -s $fname if -f $fname;
+ $size += -s $tmp_fname if -f $tmp_fname;
+ $field_indices->{$key} = $size;
+ }
+}
+
+sub complete_field_info (\%$$\$\$\$) {
+ my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_;
+
+ for my $field (keys %{$fields}) {
+ if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) {
+ delete $fields->{$field};
+ }
+ }
+
+ unless (defined($fields->{'title'})) {
+ $fields->{'title'} = gfilter::filename_to_title($cfile, $wsref);
+ }
+ unless (defined($fields->{'date'})) {
+ my $mtime = $fields->{'mtime'};
+ my $date = util::rfc822time($mtime);
+ $fields->{'date'} = $date;
+ }
+ unless (defined($fields->{'uri'})) {
+ $fields->{'uri'} = $uri;
+ }
+ unless (defined($fields->{'size'})) {
+ $fields->{'size'} = -s $cfile;
+ }
+ unless (defined($fields->{'summary'})) {
+ $fields->{'summary'} = make_summary($contref, $headings, $cfile);
+ }
+ unless (defined($fields->{'from'}) || defined($fields->{'author'})) {
+ $fields->{'from'} = getmsg("unknown");
+ }
+}
+
+#
+# Messages for NMZ.* files should currently be encoded in EUC-JP.
+# We cannot use gettext.pl for the message because gettext.pl may
+# return Shift_JIS encoded messages, so we use this function
+# instead of gettext().
+#
+# FIXME: Ad hoc impl. getmsg() is effective only for "unknown".
+#
+sub getmsg($) {
+ my ($msg) = @_;
+
+ if (util::islang_msg("ja")) {
+ if ($msg eq "unknown") {
+ return "ÉÔÌÀ";
+ }
+ }
+ return $msg;
+}
+
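+#
+# Build a summary string of about $conf::MAX_FIELD_LENGTH bytes
+# from the headings and the document content.
+#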
+sub make_summary ($$$) {
+ my ($contref, $headings, $cfile) = @_;
+
+    # Pick up a string of about $conf::MAX_FIELD_LENGTH bytes.
+ my $tmp = "";
+ if ($$headings ne "") {
+ $$headings =~ s/^\s+//;
+ $$headings =~ s/\s+/ /g;
+ $tmp = $$headings;
+ } else {
+ $tmp = "";
+ }
+
+ my $offset = 0;
+ my $tmplen = 0;
+ while (($tmplen = $conf::MAX_FIELD_LENGTH + 1 - length($tmp)) > 0
+ && $offset < length($$contref))
+ {
+ $tmp .= substr $$contref, $offset, $tmplen;
+ $offset += $tmplen;
+ $tmp =~ s/(([\xa1-\xfe]).)/$2 eq "\xa8" ? '': $1/ge;
+ $tmp =~ s/([-=*\#])\1{2,}/$1$1/g;
+ }
+
+    # -1 leaves room for a trailing "LF".
+ my $summary = substr $tmp, 0, $conf::MAX_FIELD_LENGTH - 1;
+ # Remove a garbage Kanji 1st char at the end.
+ $summary = codeconv::chomp_eucjp($summary);
+
+ $summary =~ s/^\s+//;
+ $summary =~ s/\s+/ /g; # normalize white spaces
+
+ return $summary;
+}
+
+
+# Output the field information into the NMZ.field.* files.
+sub put_field_index (\%$) {
+ my ($fields, $field_indices) = @_;
+
+ my $aliases_regex =
+ join('|', sort {length($b) <=> length($a)} keys %conf::FIELD_ALIASES);
+
+ for my $field (keys %{$fields}) {
+ util::dprint("Field: $field: $fields->{$field}\n");
+ if ($field =~ /^($aliases_regex)$/o) {
+ unless (defined($fields->{$conf::FIELD_ALIASES{$field}})) {
+ $fields->{$conf::FIELD_ALIASES{$field}} = $fields->{$field};
+ }
+ undef $fields->{$field};
+ }
+ }
+
+ my @keys = split '\|', $conf::SEARCH_FIELD;
+ if ($var::Opt{'meta'}) {
+ my @meta = split '\|', $conf::META_TAGS;
+ while (my $meta = shift(@meta)) {
+ $meta = $conf::FIELD_ALIASES{$meta}
+ if (defined $conf::FIELD_ALIASES{$meta});
+
+ push @keys, $meta;
+ }
+
+ # uniq @keys
+ my %mark = ();
+ @keys = grep {$mark{$_}++; $mark{$_} == 1} @keys;
+ }
+ for my $key (@keys) {
+ my $lkey = lc($key);
+ my $fname = util::tmpnam("NMZ.field.$lkey");
+ my $fh_field = util::efopen(">>$fname");
+ my $output = "";
+ if (defined($fields->{$key})) {
+ if ($key ne 'uri') { # workaround for namazu-bugs-ja#30
+ $fields->{$key} =~ s/\s+/ /g;
+ $fields->{$key} =~ s/\s+$//;
+ $fields->{$key} =~ s/^\s+//;
+ }
+ $output = $fields->{$key};
+
+	    # -1 leaves room for a trailing "LF".
+ $output = substr $output, 0, $conf::MAX_FIELD_LENGTH - 1;
+ # Remove a garbage Kanji 1st char at the end.
+ $output = codeconv::chomp_eucjp($output);
+
+ $output =~ s/\n.*$//s;
+ $output .= "\n";
+ } else {
+ $output = "\n";
+ }
+ print $fh_field $output;
+ util::fclose($fh_field);
+
+ # put index of field index
+ {
+ my $fname = util::tmpnam("NMZ.field.$lkey.i");
+ my $fh_field_idx = util::efopen(">>$fname");
+ print $fh_field_idx pack("N", $field_indices->{$lkey});
+ $field_indices->{$lkey} += length $output;
+ util::fclose($fh_field_idx);
+ }
+ }
+
+}
+
+# Put the date information into the NMZ.t file.
+sub put_dateindex ($) {
+ my ($cfile) = @_;
+ my $mtime = (stat($cfile))[9];
+
+    my $fh_dateindex = util::efopen(">>$var::NMZ{'_t'}");
+    print $fh_dateindex pack("N", $mtime);
+    util::fclose($fh_dateindex);
+}
+
+
+# load a document file
+sub load_document ($$$$\%) {
+ my ($orig_cfile, $contref, $weighted_str, $headings, $fields)
+ = @_;
+ my $cfile = $$orig_cfile;
+
+ return (0, 0, 0, 0) unless (-f $cfile && util::canopen($cfile));
+
+    # For handling a filename which contains Shift_JIS code on Windows,
+    # or a filename which contains spaces.
+ my $shelter_cfile = "";
+ if (($cfile =~ /\s/) ||
+ ($English::OSNAME eq "MSWin32"
+ && $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/) )
+ {
+ $shelter_cfile = $cfile;
+ $cfile = util::tmpnam("NMZ.win32");
+ unlink $cfile if (-e $cfile);
+ copy($shelter_cfile, $cfile);
+ }
+
+ my $file_size;
+    $file_size = util::filesize($cfile); # may handle more than plain files in the future.
+ if ($file_size > $conf::FILE_SIZE_MAX) {
+ return ($file_size, $file_size, 0, 'x-system/x-error; x-error=file_size_max');
+ }
+
+ $$contref = util::readfile($cfile);
+# $file_size = length($$contref);
+
+ my ($kanji, $mtype) = apply_filter($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, undef);
+
+ if ($English::OSNAME eq "MSWin32" && $shelter_cfile ne "") {
+ unlink $cfile;
+ $cfile = $shelter_cfile;
+ }
+
+ # Measure the text size at this time.
+ my $text_size = length($$contref) + length($$weighted_str);
+
+ return ($file_size, $text_size, $kanji, $mtype);
+}
+
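+#
+# Detect the media type of the document and run the matching filter
+# module to turn the content into plain text.
+# Returns (count of Kanji bytes, media type); on failure the media type
+# carries an x-error or x-system suffix.
+#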
+sub apply_filter($$$$$$$) {
+ my ($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, $mmtype)
+ = @_;
+ my $cfile = $shelter_cfile ne "" ? $shelter_cfile : $$orig_cfile;
+
+ # Filtering process.
+ my $mtype;
+ my $called_dt = 0;
+ while (1) {
+ if (defined $MediaType) {
+ $mtype = $MediaType;
+ } elsif (defined $mmtype) {
+ $mtype = $mmtype;
+ } else {
+ my $mtype_n = $Magic->checktype_byfilename($cfile);
+ my $mtype_c = $Magic->checktype_data($$contref);
+ my $mtype_m;
+ $mtype_m = $Magic->checktype_magic($$contref)
+ if ((! defined $mtype_c) ||
+ $mtype_c =~
+ /^(text\/html|text\/plain|application\/octet-stream)$/);
+ $mtype_c = $mtype_m
+ if (defined $mtype_m &&
+ $mtype_m !~
+ /^(text\/html|text\/plain|application\/octet-stream)$/);
+ $mtype_c = 'text/plain' unless defined $mtype_c;
+ if ($called_dt) {
+ $mtype = $mtype_c;
+ } else {
+ $mtype = decide_type($mtype_n, $mtype_c);
+ $called_dt = 1;
+ }
+ }
+ util::dprint(_("Detected type: ")."$mtype\n");
+
+ # Pre code conversion.
+ if ($var::REQUIRE_PRE_CODECONV{$mtype}) {
+ util::dprint("pre_codeconv\n");
+ codeconv_document($contref);
+ }
+
+ if (! $var::Supported{$mtype} ||
+ $var::Supported{$mtype} ne 'yes')
+ {
+ util::vprint(_("Unsupported media type ")."$mtype\n");
+ return (0, "$mtype; x-system=unsupported");
+ }
+
+ if ($var::REQUIRE_ACTIONS{$mtype}) {
+ util::vprint(_("Using ")."$var::REQUIRE_ACTIONS{$mtype}.pl\n");
+ require $var::REQUIRE_ACTIONS{$mtype}.'.pl'
+ || die _("unable to require ") .
+ "\"$var::REQUIRE_ACTIONS{$mtype}.pl\"\n";
+ my $err = undef;
+ {
+ local $SIG{'PIPE'} = \&trapintr;
+ eval '$err = ' . $var::REQUIRE_ACTIONS{$mtype} .
+ '::filter($orig_cfile, $contref, $weighted_str, $headings, $fields);';
+ }
+ if ($err) {
+ if ($err =~ m/; x-system=unsupported$/) {
+ return (0, $err);
+ }
+ return (0, "$mtype; x-error=$err");
+ }
+
+ if ($@) {
+ util::vprint(_("Failed to call ")."$var::REQUIRE_ACTIONS{$mtype}\n$@\n");
+ return (0, "$mtype; x-error=$@");
+ }
+
+ # Post code conversion.
+ if ($var::REQUIRE_POST_CODECONV{$mtype}) {
+ util::dprint("post_codeconv\n");
+ codeconv_document($contref);
+ }
+
+ next if ($var::RECURSIVE_ACTIONS{$mtype});
+ }
+ last;
+ }
+
+ my $kanji = $$contref =~ tr/\xa1-\xfe/\xa1-\xfe/; # Kanji contained?
+ $kanji += $$weighted_str =~ tr/\xa1-\xfe/\xa1-\xfe/;
+
+ return ($kanji, $mtype);
+}
+
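+#
+# Convert the text to EUC-JP, normalize line endings, and replace
+# control characters with spaces.
+#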
+sub codeconv_document ($) {
+ my ($textref) = @_;
+ codeconv::toeuc($textref);
+ $$textref =~ s/\r\n/\n/g;
+ $$textref =~ s/\r/\n/g;
+    $$textref =~ tr/\x01-\x08\x0b-\x0c\x0e-\x1f\x7f/ /; # Replace control chars with spaces.
+}
+
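+#
+# Prepare for indexing: check the present index and lock file, collect
+# target files (appending to an existing index if NMZ.r exists) and save
+# them to the file list. Returns (docid base, number of target files).
+#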
+sub prep () {
+ my $docid_base = 0;
+ my $output_dir = shift @_ ;
+ my @targets = @_ ;
+ my @flist = ();
+
+ $var::OUTPUT_DIR = $output_dir;
+
+ require_modules();
+ change_filenames();
+ check_present_index();
+
+ # if Checkpoint mode, return
+ return (0, 0) if $CheckPoint{'continue'};
+
+ check_lockfile($var::NMZ{'lock2'});
+ print _("Looking for indexing files...\n");
+ @flist = find_target(@targets);
+ ($docid_base, @flist) = append_index(@flist)
+ if -f $var::NMZ{'r'};
+ unless (@flist) { # if @flist is empty
+ print _("No files to index.\n");
+ exit 0;
+ }
+ set_lockfile($var::NMZ{'lock2'});
+ save_flist(@flist);
+ my $total_files_num = @flist;
+
+ return ($docid_base, $total_files_num);
+}
+
+sub save_flist(@) {
+ my @flist = @_;
+ return if (@flist == 0);
+
+ my $fh_flist = util::efopen(">$var::NMZ{'_flist'}");
+ print $fh_flist join("\n", @flist), "\n";
+ util::fclose($fh_flist);
+}
+
+sub require_modules() {
+ if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) {
+ require NKF || die "unable to require \"NKF\"\n";
+ util::dprint(_("code conversion: using NKF module\n"));
+ $var::USE_NKF_MODULE = 1;
+ }
+ if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) {
+ require Text::Kakasi || die "unable to require \"Text::Kakasi\"\n";
+ util::dprint(_("wakati: using Text::Kakasi module\n"));
+ my $res = Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-oeuc', '-w');
+ }
+ if (util::islang("ja") && $conf::WAKATI =~ /^module_chasen/) {
+ require Text::ChaSen || die "unable to require \"Text::ChaSen\"\n";
+ util::dprint(_("wakati: using Text::ChaSen module\n"));
+ my @arg = ('-i', 'e', '-j', '-F', '%m ');
+ @arg = ('-i', 'e', '-j', '-F', '%m %H\\n') if $var::Opt{'noun'};
+ my $res = Text::ChaSen::getopt_argv('chasen-perl', @arg);
+ }
+ if (util::islang("ja") && $conf::WAKATI =~ /^module_mecab/) {
+ require MeCab || die "unable to require \"MeCab\"\n";
+ util::dprint(_("wakati: using MeCab module\n"));
+ }
+}
+
+sub check_lockfile ($) {
+    # Warn if the lock file exists, which means another process may be
+    # running or a previous run stopped abnormally (the latter is not the
+    # major concern, though).
+    # This is mainly for early detection before a possibly long find_target.
+ my ($file) = @_;
+
+ if (-f $file) {
+ print "$file "._("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n");
+ exit 1;
+ }
+}
+
+sub set_lockfile ($) {
+ my ($file) = @_;
+
+ # make a lock file
+ if (-f $file) {
+ print "$file found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n";
+ exit 1;
+ } else {
+ my $fh_lockfile = util::efopen(">$file");
+ print $fh_lockfile "$$"; # save pid
+ util::fclose($fh_lockfile);
+ }
+}
+
+sub remove_lockfile ($) {
+ my ($file) = @_;
+
+ # remove lock file
+ unlink $file if -f $file;
+}
+
+# Check whether the present index is the old type or not.
+sub check_present_index () {
+ if (-f $var::NMZ{'i'} && ! -f "$var::NMZ{'wi'}")
+ {
+ util::cdie(_("Present index is old type. it's unsupported.\n"));
+ }
+}
+
+# Do the remaining jobs after indexing: write the NMZ.* files,
+# update the registry, and clean up temporary files.
+sub do_remain_job ($$$$) {
+ my ($total_files_size, $docid_count, $key_count, $start_time) = @_;
+
+ if ($docid_count == 0) {
+ # No files are indexed
+ if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
+ update_dateindex();
+ update_registry($docid_count);
+ }
+ } else {
+ set_lockfile($var::NMZ{'lock'});
+ write_version();
+ write_body_msg();
+ write_tips_msg();
+ write_result_file();
+ update_field_index();
+ update_dateindex();
+ update_registry($docid_count);
+ write_nmz_files();
+ make_slog_file();
+ remove_lockfile($var::NMZ{'lock'});
+ }
+ make_headfoot_pages($docid_count, $key_count);
+ put_log($total_files_size, $start_time, $docid_count, $key_count);
+ util::remove_tmpfiles();
+ unlink $var::NMZ{'_flist'};
+}
+
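+#
+# Regenerate the NMZ.head.* and NMZ.foot.* pages for every template
+# found in the template directory.
+#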
+sub make_headfoot_pages($$) {
+ my ($docid_count, $key_count) = @_;
+
+ for my $file (glob "$TEMPLATEDIR/NMZ.head*") {
+ if ($file =~ m!^.*/NMZ\.head(\.[-\w\.]+)?$!){
+ my $suffix = $1 ? $1 : '';
+ make_headfoot("$var::NMZ{'head'}${suffix}", $docid_count, $key_count);
+ }
+ }
+ for my $file (glob "$TEMPLATEDIR/NMZ.foot*") {
+ if ($file =~ m!^.*/NMZ\.foot(\.[-\w\.]+)?$!){
+ my $suffix = $1 ? $1 : '';
+ make_headfoot("$var::NMZ{'foot'}${suffix}", $docid_count, $key_count);
+ }
+ }
+}
+
+# Parse command line options.
+sub parse_options
+{
+ if (@ARGV == 0) {
+ show_mini_usage();
+ exit 1;
+ }
+
+ my @targets = ();
+ my $targets_loaded = 0;
+ my @argv = @ARGV;
+ my $cwd = cwd();
+
+ my $opt_dummy = 0;
+ my $opt_version = 0;
+ my $opt_help = 0;
+ my $opt_all = 0;
+ my $opt_chasen = 0;
+ my $opt_chasen_noun = 0;
+ my $opt_kakasi = 0;
+ my $opt_mecab = 0;
+ my $opt_checkpoint_sub = 0;
+ my $opt_show_config = 0;
+ my $opt_mailnews = 0;
+ my $opt_mhonarc = 0;
+ my $opt_norc = 0;
+
+ my $opt_quiet = undef;
+ my $opt_config = undef;
+ my $output_dir = undef;
+ my $update_index = undef;
+ my $include_file = undef;
+ my $target_list = undef;
+ my $index_lang = undef;
+
+ my %opt_conf;
+
+# Getopt::Long::Configure('bundling');
+ Getopt::Long::config('bundling');
+ GetOptions(
+ '0|help' => \$opt_help,
+ '1|exclude=s' => \$opt_conf{'EXCLUDE_PATH'},
+ '2|deny=s' => \$opt_conf{'DENY_FILE'},
+ '3|allow=s' => \$opt_conf{'ALLOW_FILE'},
+ '4|update=s' => \$update_index,
+ '5|mhonarc' => \$opt_mhonarc,
+ '6|mtime=s' => \$var::Opt{'mtime'},
+ '7|html-split' => \$var::Opt{'htmlsplit'},
+ 'C|show-config' => \$opt_show_config,
+ 'E|no-edge-symbol' => \$var::Opt{'noedgesymbol'},
+ 'F|target-list=s' => \$target_list,
+ 'G|no-okurigana' => \$var::Opt{'okurigana'},
+ 'H|no-hiragana' => \$var::Opt{'hiragana'},
+ 'I|include=s' => \$include_file,
+ 'K|no-symbol' => \$var::Opt{'nosymbol'},
+ 'L|indexing-lang=s' => \$index_lang,
+ 'M|meta' => \$var::Opt{'meta'},
+ 'O|output-dir=s' => \$output_dir,
+ 'S|checkpoint-sub' => \$opt_checkpoint_sub,
+ 'T|template-dir=s' => \$TEMPLATEDIR,
+ 'U|no-encode-uri' => \$var::Opt{'noencodeuri'} ,
+ 'V|verbose' => \$var::Opt{'verbose'},
+ 'Y|no-delete' => \$var::Opt{'nodelete'},
+ 'Z|no-update' => \$var::Opt{'noupdate'},
+ 'a|all' => \$opt_all,
+ 'b|use-mecab' => \$opt_mecab,
+ 'c|use-chasen' => \$opt_chasen,
+ 'd|debug' => \$var::Opt{'debug'},
+ 'e|robots' => \$var::Opt{'robotexclude'},
+ 'f|config=s' => \$opt_config,
+ 'h|mailnews' => \$opt_mailnews,
+ 'k|use-kakasi' => \$opt_kakasi,
+ 'm|use-chasen-noun' => \$opt_chasen_noun,
+ 'q|quiet' => \$opt_quiet,
+ 'r|replace=s' => \$ReplaceCode,
+ 's|checkpoint' => \$CheckPoint{'on'},
+ 't|media-type=s' => \$MediaType,
+ 'u|uuencode' => \$opt_dummy, # for backward compat.
+ 'v|version' => \$opt_version,
+ 'x|no-heading-summary'=> \$var::Opt{'noheadabst'},
+ 'z|check-filesize' => \$var::Opt{'checkfilesize'},
+ 'decode-base64' => \$var::Opt{'decodebase64'},
+ 'norc' => \$opt_norc,
+ );
+
+ if ($opt_quiet) {
+ # Make STDOUT quiet by redirecting STDOUT to null device.
+ my $devnull = util::devnull();
+ open(STDOUT, ">$devnull") || die "$devnull: $!";
+ }
+
+ if (defined $update_index) {
+ unless (-d $update_index) {
+ print _("No such index: "), "$update_index\n";
+ exit 1;
+ }
+
+ my $orig_status = $var::NMZ{'status'};
+ $var::NMZ{'status'} = "$update_index/$var::NMZ{'status'}";
+
+ my $argv = get_status("argv");
+ if (!defined $argv) {
+ print _("No such index: "), "$update_index\n";
+ exit 1;
+ }
+ @ARGV = split /\t/, $argv;
+ util::dprint(_("Inherited argv: ")."@ARGV\n");
+
+ my $cwd = get_status("cwd");
+ if (!defined $cwd) {
+ print _("No such index: "), "$update_index\n";
+ exit 1;
+ }
+ chdir $cwd;
+ util::dprint(_("Inherited cwd: ")."$cwd\n");
+
+ ($output_dir, @targets) = parse_options();
+ $output_dir = $update_index;
+ $var::NMZ{'status'} = $orig_status; # See also change_filenames()
+ return ($output_dir, @targets);
+ }
+
+ if (!$opt_norc && !(defined $ENV{'MKNMZNORC'})){
+ load_rcfiles();
+ }
+ if ($opt_config) {
+ if (-f $opt_config) {
+ util::vprint(_("Reading rcfile: "));
+ load_rcfile($ConfigFile = $opt_config);
+ util::vprint(" $opt_config\n");
+ }
+ }
+
+ if ($index_lang) {
+ $util::LANG = $index_lang;
+ util::dprint("Override indexing language: $util::LANG\n");
+ }
+
+ if ($opt_help) {
+ show_usage();
+ exit 1;
+ }
+
+ if ($opt_version) {
+ show_version();
+ exit 1;
+ }
+
+ load_filtermodules(); # to make effect $opt_config, $index_lang.
+ postload_modules();
+
+ foreach my $key (keys %opt_conf){
+ if (defined ($opt_conf{$key})) {
+ ${*{$conf::{$key}}{SCALAR}} = $opt_conf{$key};
+ }
+ }
+
+ if ($opt_mailnews) {
+ $MediaType = 'message/rfc822';
+ }
+ if ($opt_mhonarc) {
+ $MediaType = 'text/html; x-type=mhonarc';
+ }
+ if ($opt_all) {
+ $conf::ALLOW_FILE = ".*";
+ }
+ if ($opt_chasen) {
+ $conf::WAKATI = $conf::CHASEN;
+ $var::Opt{'noun'} = 0;
+ }
+ if ($opt_chasen_noun) {
+ $conf::WAKATI = $conf::CHASEN_NOUN;
+ $var::Opt{'noun'} = 1;
+ }
+ if ($opt_kakasi) {
+ $conf::WAKATI = $conf::KAKASI;
+ $var::Opt{'noun'} = 0;
+ }
+ if ($opt_mecab) {
+ $conf::WAKATI = $conf::MECAB;
+ $var::Opt{'noun'} = 0;
+ }
+ if ($include_file) {
+ do $include_file;
+ util::dprint("Included: $include_file\n");
+ }
+ if ($target_list) {
+ if ($CheckPoint{'continue'}) {
+ @targets = ("dummy");
+ } else {
+ @targets = load_target_list($target_list);
+ util::dprint(_("Loaded: ")."$target_list\n");
+ }
+ $targets_loaded = 1;
+ }
+ if ($opt_checkpoint_sub) {
+ $CheckPoint{'on'} = 1;
+ $CheckPoint{'continue'} = 1;
+ @argv = grep {! /^-S$/} @argv; # remove -S
+ }
+
+ if (defined $ReplaceCode) {
+ my $orig = "/foo/bar/baz/quux.html";
+ $_ = $orig;
+ eval $ReplaceCode;
+ if ($@) { # eval error
+ util::cdie(_("Invalid replace: ")."$ReplaceCode\n");
+ }
+ util::dprint(_("Replace: ")."$orig -> $_\n");
+ }
+
+ if ($opt_show_config) {
+ show_config();
+ exit 1;
+ }
+
+ if (@ARGV == 0 && $targets_loaded == 0) {
+ show_mini_usage();
+ exit 1;
+ }
+
+ $output_dir = $cwd unless defined $output_dir;
+ util::cdie("$output_dir: "._("invalid output directory\n"))
+ unless (-d $output_dir && -w $output_dir);
+
+ if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
+ util::win32_yen_to_slash(\$output_dir);
+ }
+
+ # take remaining @ARGV as targets
+ if (@ARGV > 0 && $targets_loaded == 0) {
+ @targets = @ARGV ;
+ }
+
+ # revert @ARGV
+ # unshift @ARGV, splice(@argv, 0, @argv - @ARGV);
+ @ARGV = @argv;
+
+ return ($output_dir, @targets);
+}
+
+sub show_config () {
+ print _("Loaded rcfile: ") . "@LoadedRcfiles\n" if @LoadedRcfiles;
+ print _("System: ") . "$English::OSNAME\n" if $English::OSNAME;
+ print _("Namazu: ") . "$var::VERSION\n" if $var::VERSION;
+ print _("Perl: ") . sprintf("%f\n", $English::PERL_VERSION);
+ print _("File-MMagic: ") . "$File::MMagic::VERSION\n" if $File::MMagic::VERSION;
+ print _("NKF: ") . "$conf::NKF\n" if $conf::NKF;
+ print _("KAKASI: ") . "$conf::KAKASI\n" if $conf::KAKASI;
+ print _("ChaSen: ") . "$conf::CHASEN\n" if $conf::CHASEN;
+ print _("MeCab: ") . "$conf::MECAB\n" if $conf::MECAB;
+ print _("Wakati: ") . "$conf::WAKATI\n" if $conf::WAKATI;
+ print _("Lang_Msg: ") . "$util::LANG_MSG\n";
+ print _("Lang: ") . "$util::LANG\n";
+ print _("Coding System: ") . "$CodingSystem\n";
+ print _("CONFDIR: ") . "$CONFDIR\n";
+ print _("LIBDIR: ") . "$LIBDIR\n";
+ print _("FILTERDIR: ") . "$FILTERDIR\n";
+ print _("TEMPLATEDIR: ") . "$TEMPLATEDIR\n";
+
+ my @all_types = keys %var::Supported;
+ my @supported = sort grep { $var::Supported{$_} eq "yes" } @all_types;
+
+ my $num_supported = @supported;
+ my $num_unsupported = @all_types - @supported;
+ print _("Supported media types: ") . "($num_supported)\n";
+ print _("Unsupported media types: ") . "($num_unsupported) " . _("marked with minus (-) probably missing application in your \$path.\n");
+ for my $mtype (sort keys %var::Supported) {
+ my $yn = $var::Supported{$mtype};
+ if ($yn eq 'yes') { $yn = ' ' } else {$yn = '-'};
+ print "$yn $mtype";
+ if ($var::REQUIRE_ACTIONS{$mtype}){
+ print ": $var::REQUIRE_ACTIONS{$mtype}.pl";
+ }
+ print "\n";
+ }
+}
+
+sub load_target_list ($) {
+ my ($file) = @_;
+ my $fh_targets = util::efopen($file);
+ my @targets = <$fh_targets>;
+ util::fclose($fh_targets);
+ if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
+ foreach my $tmp (@targets){
+ $tmp =~ s/\r//g;
+ util::win32_yen_to_slash(\$tmp);
+ }
+ }
+ chomp @targets;
+ return @targets;
+}
+
+# convert a relative path into an absolute path
+sub absolute_path($$) {
+ my ($cwd, $path) = @_;
+
+ $path =~ s!^\.$!\./!;
+ $path =~ s!^\.[/\\]!$cwd/!;
+ if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
+ util::win32_yen_to_slash(\$path);
+ if ($path =~ m!^//!) {
+ } elsif ($path =~ m!^/[^/]!) {
+ my $driveletter = $cwd;
+ if ($driveletter =~ m!^([A-Z]:)!i){
+ $driveletter = $1;
+ }
+ $path = "$driveletter$path";
+ } elsif ($path !~ m!^[A-Z]:/!i) {
+ $path = "$cwd/$path";
+ }
+ } else {
+ $path =~ s!^([^/])!$cwd/$1!;
+ }
+ return $path;
+}
+
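+#
+# Traverse the given files and directories and return the sorted,
+# de-duplicated list of files to index.
+#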
+sub find_target (@) {
+ my @targets = @_;
+
+ my $cwd = cwd();
+ @targets = map { absolute_path($cwd, $_) } @targets;
+
+ # Convert \ to / with consideration for Shift_JIS encoding.
+ if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
+ foreach my $tmp (@targets){
+ util::win32_yen_to_slash(\$tmp);
+ }
+ }
+
+ # For reporting effects of --allow, --deny, --exclude, --mtime
+ # options in --verbose mode.
+ my %counts = ();
+ $counts{'possible'} = 0;
+ $counts{'excluded'} = 0;
+ $counts{'too_old'} = 0;
+ $counts{'too_new'} = 0;
+ $counts{'not_allowed'} = 0;
+ $counts{'denied'} = 0;
+
+ # Traverse directories.
+    # This routine is not efficient but I prefer reliable logic.
+ my @flist = ();
+ my $start = time();
+ util::vprint(_("find_target starting: "). localtime($start). "\n");
+ while (@targets) {
+ my $target = shift @targets;
+
+ if ($target eq '') {
+ print STDERR "Warning: target contains empty line, skip it\n";
+ next;
+ }
+
+ if (-f $target) { # target is a file.
+ add_target($target, \@flist, \%counts);
+ } elsif (-d $target) { # target is a directory.
+ my @subtargets = ();
+	    # Find subdirectories in the target directory
+	    # because File::Find::find() does not follow symlinks.
+ if (-l $target) {
+ my $dh = new DirHandle($target);
+ while (defined(my $ent = $dh->read)) {
+ next if ($ent =~ /^\.{1,2}$/);
+ if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
+ next if ($ent =~ m!^($conf::DENY_DDN)$!i);
+ my $tmp = $ent;
+ util::win32_yen_to_slash(\$tmp);
+ next if ($ent ne $tmp);
+ }
+ my $fname = "$target/$ent";
+ next if ($fname eq '.' || $fname eq '..');
+ if (-d $fname) {
+ push(@subtargets, $fname);
+ } else {
+ add_target($fname, \@flist, \%counts);
+ }
+ }
+ } else {
+ @subtargets = ($target);
+ }
+
+ #
+ # Wanted routine for File::Find's find().
+ #
+ my $wanted_closure = sub {
+ my $fname = "$File::Find::dir/$_";
+ add_target($fname, \@flist, \%counts);
+ };
+
+ find($wanted_closure, @subtargets) if (@subtargets > 0);
+ } else {
+ print STDERR _("unsupported target: ") . $target;
+ }
+ }
+
+ # uniq @flist
+ my %mark = ();
+ @flist = grep {$mark{$_}++; $mark{$_} == 1} @flist;
+
+ # Sort file names with consideration for numbers.
+ @flist = map { $_->[0] }
+ sort { $a->[1] cmp $b->[1] }
+ map { my $tmp = $_; $tmp =~ s/(\d+)/sprintf("%08d", $1)/ge;
+ [ $_, $tmp ] } @flist;
+
+ my $elapsed = time() - $start ;
+ $elapsed += 1 ; # to round up and avoid 0
+
+ # For --verbose option.
+ report_find_target($elapsed, $#flist + 1, %counts);
+
+ return @flist;
+}
+
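+#
+# Check one candidate file against the exclude/mtime/deny/allow rules
+# and push it onto the target list if it passes.
+#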
+sub add_target ($\@\%) {
+ my ($target, $flists_ref, $counts_ref) = @_;
+
+ if ($target =~ /[\n\r\t]/) {
+ $target =~ s/[\n\r\t]//g;
+ print STDERR "Warning: $target contains LF/CR/TAB chars, skip it\n";
+ return; # skip a file name containing LF/CR/TAB chars.
+ }
+
+ return unless -f $target; # Only file is targeted.
+
+ $counts_ref->{'possible'}++;
+
+ unless (util::canopen($target)) {
+ util::vprint(sprintf(_("Unreadable: %s"), $target));
+ $counts_ref->{'excluded'}++;
+ return;
+ }
+
+
+ if (defined $conf::EXCLUDE_PATH &&
+ $target =~ /$conf::EXCLUDE_PATH/ )
+ {
+ util::vprint(sprintf(_("Excluded: %s"), $target));
+ $counts_ref->{'excluded'}++;
+ return;
+ }
+
+ #
+ # Do processing just like find's --mtime option.
+ #
+ if (defined $var::Opt{'mtime'}) {
+	my $mtime = -M $target;
+ if ($var::Opt{'mtime'} < 0) {
+
+ # This must be `>=' not `>' for consistency with find(1).
+ if (int($mtime) >= - $var::Opt{'mtime'}) {
+ util::vprint(sprintf(_("Too old: %s"), $target));
+ $counts_ref->{'too_old'}++;
+ return;
+ }
+ } elsif ($var::Opt{'mtime'} > 0) {
+ if ($var::Opt{'mtime'} =~ /^\+/) {
+ if ((int($mtime) < $var::Opt{'mtime'})) {
+ util::vprint(sprintf(_("Too new: %s"), $target));
+ $counts_ref->{'too_new'}++;
+ return;
+ }
+ } else {
+ if (int($mtime) != $var::Opt{'mtime'}) {
+ if (int($mtime) > $var::Opt{'mtime'}) {
+ util::vprint(sprintf(_("Too old: %s"),$target));
+ $counts_ref->{'too_old'}++;
+ } else {
+ util::vprint(sprintf(_("Too new: %s"),$target));
+ $counts_ref->{'too_new'}++;
+ }
+ return;
+ }
+ }
+ } else {
+ # $var::Opt{'mtime'} == 0 ;
+ return;
+ }
+ }
+
+ # Extract the file name of the target.
+ $target =~ m!^.*/([^/]+)$!;
+ my $fname = $1;
+
+ if ($fname =~ m!^($conf::DENY_FILE)$!i ) {
+ util::vprint(sprintf(_("Denied: %s"), $target));
+ $counts_ref->{'denied'}++;
+ return;
+ }
+ if ($fname !~ m!^($conf::ALLOW_FILE)$!i) {
+ util::vprint(sprintf(_("Not allowed: %s"), $target));
+ $counts_ref->{'not_allowed'}++;
+ return;
+ } else{
+ util::vprint(sprintf(_("Targeted: %s"), $target));
+ push @$flists_ref, $target;
+ }
+
+}
+
+sub report_find_target ($$%) {
+ my ($elapsed, $num_targeted, %counts) = @_;
+
+ util::vprint(_("find_target finished: ") . localtime(time()). "\n");
+ util::vprint(sprintf(_("Target Files: %d (Scan Performance: Elapsed Sec.: %d, Files/sec: %.1f)"),
+ $num_targeted, $elapsed,
+ $num_targeted /$elapsed));
+ util::vprint(sprintf(_(" Possible: %d, Not allowed: %d, Denied: %d, Excluded: %d"),
+ $counts{'possible'},
+ $counts{'not_allowed'},
+ $counts{'denied'},
+ $counts{'excluded'}));
+ util::vprint(sprintf(_(" MTIME too old: %d, MTIME too new: %d"),
+ $counts{'too_old'},
+ $counts{'too_new'}));
+}
+
+sub show_usage () {
+ util::dprint(_("lang_msg: ")."$util::LANG_MSG\n");
+ util::dprint(_("lang: ")."$util::LANG\n");
+
+ my $usage = $usage::USAGE;
+ $usage = _($usage);
+ printf "$usage", $var::VERSION, $var::TRAC_URI, $var::MAILING_ADDRESS;
+}
+
+sub show_mini_usage () {
+ print _("Usage: mknmz [options] <target>...\n");
+ print _("Try `mknmz --help' for more information.\n");
+}
+
+sub show_version () {
+ print $usage::VERSION_INFO;
+}
+
+#
+# Check the file. If no $msg is returned, the file is acceptable.
+#
+sub check_file ($$$$$) {
+ my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_;
+
+ my $msg = undef;
+ if ($mtype =~ /; x-system=unsupported$/) {
+ $mtype =~ s/; x-system=unsupported$//;
+ $msg = _("Unsupported media type ")."($mtype)"._(" skipped.");
+ } elsif ($mtype =~ /; x-error=file_size_max/) {
+ $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
+ } elsif ($mtype =~ /; x-error=.*$/) {
+ $mtype =~ s/^.*; x-error=(.*)$/$1/;
+ $msg = $mtype;
+ } elsif ($mtype =~ /^x-system/) {
+ $msg = _("system error occurred! ")."($mtype)"._(" skipped.");
+ } elsif (! -e $cfile) {
+ $msg = _("does NOT EXIST! skipped.");
+ } elsif (! util::canopen($cfile)) {
+ $msg = _("is NOT READABLE! skipped.");
+ } elsif ($text_size == 0 || $cfile_size == 0) {
+ $msg = _("is 0 size! skipped.");
+ } elsif ($mtype =~ /^application\/octet-stream/) {
+ $msg = _("may be a BINARY file! skipped.");
+ } elsif ($cfile_size > $conf::FILE_SIZE_MAX) {
+ $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
+ } elsif ($text_size > $conf::TEXT_SIZE_MAX) {
+ $msg = _("is larger than your setup after filtered, skipped: ") . 'conf::TEXT_SIZE_MAX (' . $conf::TEXT_SIZE_MAX . ') < '. $text_size ;
+ }
+
+ return $msg;
+}
+
+
+#
+# Write NMZ.version file.
+#
+sub write_version() {
+ unless (-f $var::NMZ{'version'}) {
+ my $fh = util::efopen(">$var::NMZ{'version'}");
+ print $fh "Namazu-Index-Version: $NAMAZU_INDEX_VERSION\n";
+ util::fclose($fh);
+ }
+}
+
+#
+# rename each temporary file to a real file name.
+#
+sub write_nmz_files () {
+ util::Rename($var::NMZ{'_i'}, $var::NMZ{'i'});
+ util::Rename($var::NMZ{'_ii'}, $var::NMZ{'ii'});
+ util::Rename($var::NMZ{'_w'}, $var::NMZ{'w'});
+ util::Rename($var::NMZ{'_wi'}, $var::NMZ{'wi'});
+ util::Rename($var::NMZ{'_p'}, $var::NMZ{'p'});
+ util::Rename($var::NMZ{'_pi'}, $var::NMZ{'pi'});
+}
+
+# output NMZ.body
+sub write_body_msg () {
+ for my $file (glob "$TEMPLATEDIR/NMZ.body*") {
+ if ($file =~ m!^.*/NMZ\.body(\.[-\w\.]+)?$!){
+ my $suffix = $1 ? $1 : '';
+ write_message("$var::NMZ{'body'}${suffix}");
+ }
+ }
+}
+
+# output NMZ.tips
+sub write_tips_msg () {
+ for my $file (glob "$TEMPLATEDIR/NMZ.tips*") {
+ if ($file =~ m!^.*/NMZ\.tips(\.[-\w\.]+)?$!){
+ my $suffix = $1 ? $1 : '';
+ write_message("$var::NMZ{'tips'}${suffix}");
+ }
+ }
+}
+
+
+# output NMZ.result.*
+sub write_result_file () {
+ my $fname = "NMZ.result.normal";
+
+ my @files = glob "$TEMPLATEDIR/NMZ.result.*";
+
+ for my $file (@files) {
+ $file =~ m!(NMZ\.result\.[^/]*)$!;
+ my $target = "$var::OUTPUT_DIR/$1";
+	if (-f $target) { # already exists
+ next;
+ } else {
+ my $buf = util::readfile($file);
+ my $fh_file = util::efopen(">$target");
+ print $fh_file $buf;
+ util::fclose($fh_file);
+ }
+ }
+}
+
+# Write NMZ.body, NMZ.tips, etc. from the templates.
+sub write_message ($) {
+ my ($msgfile) = @_;
+
+ if (! -f $msgfile) {
+ my ($template, $fname);
+
+ $msgfile =~ m!.*/(.*)$!;
+ $fname = $1;
+ $template = "$TEMPLATEDIR/$fname";
+
+ if (-f $template) {
+ my $buf = util::readfile($template);
+ my $fh_output = util::efopen(">$msgfile");
+ print $fh_output $buf;
+ util::fclose($fh_output);
+ }
+ }
+}
+
+
+#
+# Make the NMZ.slog file for logging.
+#
+sub make_slog_file () {
+ if (! -f $var::NMZ{'slog'}) {
+ my $fh = util::efopen(">$var::NMZ{'slog'}");
+ util::fclose($fh);
+ undef $fh;
+ chmod 0666, $var::NMZ{'slog'};
+ }
+ {
+ my $fh_slogfile = util::efopen(">>$var::NMZ{'slog'}");
+ util::fclose($fh_slogfile);
+ }
+}
+
+
+#
+# Prepend the output directory to each NMZ.* file name.
+#
+sub change_filenames ($) {
+ my $dir = $var::OUTPUT_DIR;
+
+ for my $key (sort keys %var::NMZ) {
+ next if $key =~ /^_/; # exclude temporary file
+ $var::NMZ{$key} = "$dir/$var::NMZ{$key}";
+ }
+
+ # temporary files
+ for my $key (sort keys %var::NMZ) {
+ if ($key =~ /^_/) {
+ $var::NMZ{$key} = util::tmpnam($var::NMZ{$key});
+ }
+ }
+
+ if ($var::Opt{'debug'}) {
+ for my $key (sort keys %var::NMZ) {
+ util::dprint("NMZ: $var::NMZ{$key}\n");
+ }
+ }
+}
+
+
+#
+# Preparation for appending to the existing index files.
+#
+sub append_index (@) {
+ my @flist = @_;
+
+ my $docid_base = 0;
+ ($docid_base, @flist) = set_target_files(@flist);
+
+ unless (@flist) { # if @flist is empty
+ if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
+ set_lockfile($var::NMZ{'lock2'});
+ update_dateindex();
+ update_registry(0);
+ make_headfoot_pages(0, get_total_keys());
+ put_log(0, 0, 0, get_total_keys());
+ make_headfoot_pages(get_status("files"), get_status("keys"));
+ util::remove_tmpfiles();
+ }
+ print _("No files to index.\n");
+ exit 0;
+ }
+
+ $APPENDMODE = 1;
+ # conserve files by copying
+ copy($var::NMZ{'i'}, $var::NMZ{'_i'});
+ copy($var::NMZ{'w'}, $var::NMZ{'_w'});
+ copy($var::NMZ{'t'}, $var::NMZ{'_t'})
+ unless -f $var::NMZ{'_t'}; # preupdated ?
+ copy($var::NMZ{'p'}, $var::NMZ{'_p'});
+ copy($var::NMZ{'pi'}, $var::NMZ{'_pi'});
+
+ return ($docid_base, @flist);
+}
+
+#
+# Set target files to @flist and return the number of registered files.
+#
+sub set_target_files() {
+ my %rdocs; # 'rdocs' means 'registered documents'
+ my @found_files = @_;
+
+ # Load the list of registered documents
+ $rdocs{'name'} = load_registry();
+
+ # Pick up overlapped documents and do marking
+ my %mark1;
+ my @overlapped_files;
+ grep {$_ !~ /^\# / && $mark1{$_}++ } @{$rdocs{'name'}};
+ $rdocs{'overlapped'} = {}; # Prepare an anonymous hash.
+ for my $overlapped (grep { $mark1{$_} } @found_files) {
+ $rdocs{'overlapped'}{$overlapped} = 1;
+ push @overlapped_files, $overlapped;
+ };
+
+    # Pick up non-overlapped documents, which are the files to index.
+ my @flist = grep { ! $mark1{$_} } @found_files;
+
+ if ($var::Opt{'noupdate'}) {
+ return (scalar @{$rdocs{'name'}}, @flist);
+ };
+
+ # Load the date index.
+ $rdocs{'mtime'} = load_dateindex();
+
+ if (@{$rdocs{'mtime'}} == 0) {
+ return (scalar @{$rdocs{'name'}}, @flist);
+ };
+
+ util::assert(@{$rdocs{'name'}} == @{$rdocs{'mtime'}},
+ "NMZ.r ($#{$rdocs{'name'}}) and NMZ.t ($#{$rdocs{'mtime'}}) are not consistent!");
+
+    # Pick up deleted documents and mark them
+    # (registered in NMZ.r but no longer existing in the filesystem).
+ my @deleted_documents;
+ unless ($var::Opt{'nodelete'}) {
+ my %mark2;
+ grep { $mark2{$_}++ } @found_files;
+ for my $deleted (grep { $_ !~ /^\# / && ! $mark2{$_} &&
+ ! $rdocs{'overlapped'}{$_} }
+ @{$rdocs{'name'}})
+ {
+ $rdocs{'deleted'}{$deleted} = 1;
+ push @deleted_documents, $deleted;
+ }
+ }
+
+ # check filesize
+ if ($var::Opt{'checkfilesize'}) {
+ $rdocs{'size'} = load_sizefield();
+ }
+
+ # Pick up updated documents and set the missing number for deleted files.
+ my @updated_documents = pickup_updated_documents(\%rdocs);
+
+ # Append updated files to the list of files to index.
+ if (@updated_documents) {
+ push @flist, @updated_documents;
+ }
+
+ # Remove duplicates.
+ my %seen = ();
+ @flist = grep { ! $seen{$_}++ } @flist;
+
+ util::dprint(_("\n\n== found files ==\n"), join("\n", @found_files), "\n");
+ util::dprint(_("\n\n== registered documents ==\n"), join("\n", @{$rdocs{'name'}}), "\n");
+ util::dprint(_("\n\n== overlapped documents ==\n"), join("\n", @overlapped_files), "\n");
+ util::dprint(_("\n\n== deleted documents ==\n"), join("\n", @deleted_documents), "\n");
+ util::dprint(_("\n\n== updated documents ==\n"), join("\n", @updated_documents), "\n");
+ util::dprint(_("\n\n== files to index ==\n"), join("\n", @flist), "\n");
+
+    # Update NMZ.t with the missing number information and
+    # append updated files and deleted files to NMZ.r with a leading '# '.
+ if (@updated_documents || @deleted_documents) {
+ $DeletedFilesCount = 0;
+ $UpdatedFilesCount = 0;
+ $UpdatedFilesCount += @updated_documents;
+# $DeletedFilesCount += @updated_documents;
+ $DeletedFilesCount += @deleted_documents;
+ preupdate_dateindex(@{$rdocs{'mtime'}});
+ preupdate_registry(@updated_documents, @deleted_documents);
+ }
+
+ # Return the number of registered documents and list of files to index.
+ return (scalar @{$rdocs{'name'}}, @flist);
+}
+
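+#
+# Write the updated and deleted document names to the temporary registry
+# (NMZ._r), each prefixed with '# ', followed by a '## deleted:' timestamp.
+#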
+sub preupdate_registry(@) {
+ my (@list) = @_;
+
+ my $fh_registry = util::efopen(">$var::NMZ{'_r'}");
+ @list = grep { s/(.*)/\# $1\n/ } @list;
+ print $fh_registry @list;
+ print $fh_registry &_("## deleted: ") . util::rfc822time(time()) . "\n\n";
+ util::fclose($fh_registry);
+}
+
+sub preupdate_dateindex(@) {
+ my @mtimes = @_;
+
+    # Rewriting the entire file is not efficient,
+    # but it is simple and reliable. This may be revised in the future.
+ my $fh_dateindex = util::efopen(">$var::NMZ{'_t'}");
+# print "\nupdate_dateindex\n", join("\n", @mtimes), "\n\n";
+ print $fh_dateindex pack("N*", @mtimes);
+ util::fclose($fh_dateindex);
+}
+
+sub update_registry ($) {
+ my ($docid_count) = @_;
+
+ {
+ my $fh_registry = util::efopen(">>$var::NMZ{'r'}");
+ my $fh_registry_ = util::efopen($var::NMZ{'_r'});
+ while (defined(my $line = <$fh_registry_>)) {
+ print $fh_registry $line;
+ }
+ if ($docid_count > 0) {
+ print $fh_registry &_("## indexed: ") . util::rfc822time(time()) . "\n\n";
+ }
+ util::fclose($fh_registry_) if (defined $fh_registry_);
+ util::fclose($fh_registry);
+ }
+ unlink $var::NMZ{'_r'};
+}
+
+sub update_dateindex () {
+ util::Rename($var::NMZ{'_t'}, $var::NMZ{'t'});
+}
+
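+#
+# Merge the temporary NMZ.field.*.tmp files into the real NMZ.field.* index files.
+#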
+sub update_field_index () {
+ my @list = glob "$var::NMZ{'field'}.*.tmp";
+ for my $tmp (@list) {
+ if ($tmp =~ m!((^.*/NMZ\.field\..+?(?:\.i)?)\.tmp$)!) {
+ my $fname_tmp = $1;
+ my $fname_out = $2;
+ {
+ my $fh_field = util::efopen(">>$fname_out");
+ my $fh_tmp = util::efopen($fname_tmp);
+
+ while (defined(my $line = <$fh_tmp>)) {
+ print $fh_field $line;
+ }
+ util::fclose($fh_tmp) if (defined $fh_tmp);
+ util::fclose($fh_field);
+ }
+ unlink $fname_tmp;
+ } else {
+ util::cdie(_("update_field_index: ")."@list");
+ }
+ }
+}
+
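+#
+# Compare registered documents with the filesystem, mark deleted and
+# updated ones with mtime -1, and return the list of updated documents.
+#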
+sub pickup_updated_documents (\%) {
+ my ($rdocs_ref) = @_;
+ my @updated_documents = ();
+
+ # To avoid duplicated outputs caused by --html-split support.
+ my %printed = ();
+ my $i = 0;
+ for my $cfile (@{$rdocs_ref->{'name'}}) {
+ if (defined($rdocs_ref->{'deleted'}{$cfile})) {
+ unless ($printed{$cfile}) {
+ print "$cfile " . _("was deleted!\n");
+ $printed{$cfile} = 1;
+ }
+ $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
+ } elsif (defined($rdocs_ref->{'overlapped'}{$cfile})) {
+ my $cfile_mtime = (stat($cfile))[9];
+ my $rfile_mtime = $rdocs_ref->{'mtime'}[$i];
+ my ($cfile_size, $rfile_size);
+ if ($var::Opt{'checkfilesize'}) {
+ $cfile_size = (stat($cfile))[7];
+ $rfile_size = $rdocs_ref->{'size'}[$i];
+ }
+
+ if ($rfile_mtime != $cfile_mtime ||
+ ($var::Opt{'checkfilesize'} && ($cfile_size != $rfile_size))) {
+ # The file is updated!
+ unless ($printed{$cfile}) {
+ print "$cfile " . _("was updated!\n");
+ $printed{$cfile} = 1;
+ }
+ push(@updated_documents, $cfile);
+ $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
+ }
+ }
+ $i++;
+ }
+
+ return @updated_documents
+}
+
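+#
+# Load the mtime list from NMZ.t (an array of 32-bit big-endian integers).
+#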
+sub load_dateindex() {
+ my $fh_dateindex = util::efopen($var::NMZ{'t'});
+
+ my $size = -s $var::NMZ{'t'};
+ my $buf = "";
+ read($fh_dateindex, $buf, $size);
+ my @list = unpack("N*", $buf); # load date index
+# print "\nload_dateindex\n", join("\n", @list), "\n\n";
+
+ util::fclose($fh_dateindex);
+ return [ @list ];
+}
+
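+#
+# Load the list of registered documents from NMZ.r; entries of deleted
+# documents are kept with a leading '# '.
+#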
+sub load_registry () {
+ my $fh_registry = util::efopen($var::NMZ{'r'});
+
+ my @list = ();
+ my %deleted = ();
+ my @registered = ();
+
+ while (defined(my $line = <$fh_registry>)) {
+ chomp($line);
+ next if $line =~ /^\s*$/; # an empty line
+ next if $line =~ /^##/; # a comment
+ if ($line =~ s/^\#\s+//) { # deleted document
+ $deleted{$line}++;
+ } else {
+	    # Remove the HTML anchor generated by the --html-split option.
+ $line =~ s/\t.*$//g;
+ push @registered, $line;
+ }
+ }
+
+ util::fclose($fh_registry) if (defined $fh_registry);
+
+ # Exclude deleted documents.
+ for my $doc (@registered) {
+ if ($deleted{$doc}) {
+ push @list, "# $doc";
+ $deleted{$doc}--;
+ } else {
+ push @list, $doc;
+ }
+ }
+
+ return [ @list ];
+}
+
+# get file size information from NMZ.field.size
+sub load_sizefield() {
+ my $fh_sizefield = util::efopen($var::NMZ{'field'} . '.size');
+ return [] unless defined $fh_sizefield;
+ my $line;
+ my @ret = ();
+ while (defined($line = <$fh_sizefield>)) {
+ chomp $line;
+ push @ret, $line;
+ }
+ util::fclose($fh_sizefield) if (defined $fh_sizefield);
+ return \@ret;
+}
+
+sub get_total_keys() {
+ my $keys = get_status("keys");
+ $keys =~ s/,//g if (defined $keys);
+ $keys = 0 unless defined $keys;
+ return $keys;
+}
+
+sub get_total_files() {
+ my $files = get_status("files");
+ $files =~ s/,//g if (defined $files);
+ $files = 0 unless defined $files;
+ return $files;
+}
+
+sub get_status($) {
+ my ($key) = @_;
+
+ my $fh = util::fopen($var::NMZ{'status'});
+ return undef unless defined $fh;
+
+ while (defined(my $line = <$fh>)) {
+ if ($line =~ /^$key\s+(.*)$/) {
+ util::dprint("status: $key = $1\n");
+ $fh->close;
+ return $1;
+ }
+ }
+ util::fclose($fh) if (defined $fh);
+ return undef;
+}
+
+sub put_total_files($) {
+ my ($number) = @_;
+ $number =~ tr/,//d;
+ put_status("files", $number);
+}
+
+sub put_total_keys($) {
+ my ($number) = @_;
+ $number =~ tr/,//d;
+ put_status("keys", $number);
+}
+
+sub put_status($$) {
+ my ($key, $value) = @_;
+
+ # remove NMZ.status file if the file has a previous value.
+ unlink $var::NMZ{'status'} if defined get_status($key);
+
+ my $fh = util::efopen(">> $var::NMZ{'status'}");
+ print $fh "$key $value\n";
+ util::fclose($fh);
+}
+
+# do logging
+sub put_log ($$$$) {
+ my ($total_files_size, $start_time, $docid_count, $total_keys_count) = @_;
+
+ my $date = localtime;
+ my $added_files_count = $docid_count;
+ my $deleted_documents_count = $DeletedFilesCount;
+ my $updated_documents_count = $UpdatedFilesCount;
+ my $total_files_count = get_total_files() + $docid_count
+ - $DeletedFilesCount - $UpdatedFilesCount;
+ my $added_keys_count = $total_keys_count - get_total_keys();
+
+ my $processtime = time - $start_time;
+ $processtime = 0 if $start_time == 0;
+
+ my @logmsgs = ();
+ if ($APPENDMODE) {
+ push @logmsgs, N_("[Append]");
+ } else {
+ push @logmsgs, N_("[Base]");
+ }
+ push @logmsgs, N_("Date:"), "$date" if $date;
+ push @logmsgs, N_("Added Documents:"), util::commas("$added_files_count")
+ if $added_files_count;
+ push @logmsgs, N_("Deleted Documents:"),
+ util::commas("$deleted_documents_count") if $deleted_documents_count;
+ push @logmsgs, N_("Updated Documents:"),
+ util::commas("$updated_documents_count") if $updated_documents_count;
+ push @logmsgs, N_("Size (bytes):"), util::commas("$total_files_size")
+ if $total_files_size;
+ push @logmsgs, N_("Total Documents:"), util::commas("$total_files_count")
+ if $total_files_count;
+ push @logmsgs, N_("Added Keywords:"), util::commas("$added_keys_count")
+ if $added_keys_count;
+ push @logmsgs, N_("Total Keywords:"), util::commas("$total_keys_count")
+ if $total_keys_count;
+ push @logmsgs, N_("Wakati:"), "$conf::WAKATI" if $conf::WAKATI;
+ push @logmsgs, N_("Time (sec):"), util::commas("$processtime")
+ if $processtime;
+ push @logmsgs, N_("File/Sec:"), sprintf "%.2f",
+ (($added_files_count + $updated_documents_count) / $processtime)
+ if $processtime;
+ push @logmsgs, N_("System:"), "$English::OSNAME" if $English::OSNAME;
+ push @logmsgs, N_("Perl:"), sprintf("%f", $English::PERL_VERSION);
+ push @logmsgs, N_("Namazu:"), "$var::VERSION" if $var::VERSION;
+
+ my $log_for_file = "";
+
+ my $msg = shift @logmsgs; # [Base] or [Append]
+ # To stdout, use gettext.
+ print _($msg), "\n";
+ # To log file, do not use gettext.
+ $log_for_file = $msg . "\n";
+ while (@logmsgs) {
+ my $field = shift @logmsgs;
+ my $value = shift @logmsgs;
+ printf "%-20s %s\n", _($field), "$value";
+ $log_for_file .= sprintf "%-20s %s\n", $field, "$value";
+ }
+ print "\n";
+ $log_for_file .= "\n";
+
+ put_log_to_logfile($log_for_file);
+ put_total_files($total_files_count);
+ put_total_keys($total_keys_count);
+
+ my $argv = join "\t", @ARGV;
+ my $cwd = cwd();
+ put_status("argv", $argv);
+ put_status("cwd", $cwd);
+}
+
+sub put_log_to_logfile ($) {
+ my ($logmsg) = @_;
+ my $fh_logfile = util::efopen(">>$var::NMZ{'log'}");
+ print $fh_logfile $logmsg;
+ util::fclose($fh_logfile);
+}
+
+sub get_year() {
+ my $year = (localtime)[5] + 1900;
+
+ return $year;
+}
+
+# Compose NMZ.head and NMZ.foot, falling back to the template files when
+# they do not exist yet. Fill in the document count, keyword count, date,
+# version, index directory, and contact address placeholders.
+sub make_headfoot ($$$) {
+ my ($file, $docid_count, $key_count) = @_;
+
+ my $day = sprintf("%02d", (localtime)[3]);
+ my $month = sprintf("%02d", (localtime)[4] + 1);
+ my $year = get_year();
+ my $buf = "";
+
+ if (-f $file) {
+ $buf = util::readfile($file);
+ } else {
+ my $fname = basename($file);
+ my $template = "$TEMPLATEDIR/$fname";
+
+ if (-f $template) {
+ $buf = util::readfile($template);
+ } else {
+ return;
+ }
+ }
+
+ my $fh_file = util::efopen(">$file");
+
+ if ($buf =~ /(<!-- FILE -->)\s*(.*)\s*(<!-- FILE -->)/) {
+ my $total_files_count = util::commas(get_total_files() + $docid_count
+ - $DeletedFilesCount - $UpdatedFilesCount);
+ $buf =~ s/(<!-- FILE -->)(.*)(<!-- FILE -->)/$1 $total_files_count $3/;
+
+ }
+ if ($buf =~ /(<!-- KEY -->)\s*(.*)\s*(<!-- KEY -->)/) {
+ my $tmp = util::commas($key_count);
+ $buf =~ s/(<!-- KEY -->)(.*)(<!-- KEY -->)/$1 $tmp $3/;
+ }
+ my $index_dir = basename($var::OUTPUT_DIR);
+ $buf =~ s#<!-- INDEX_DIR -->#$index_dir#gs;
+ $buf =~ s#(<!-- DATE -->)(.*)(<!-- DATE -->)#$1 $year-$month-$day $3#gs;
+ $buf =~ s/(<!-- VERSION -->)(.*)(<!-- VERSION -->)/$1 v$var::VERSION $3/gs;
+ $buf =~ s{(<!-- ADDRESS -->)(.*)(<!-- ADDRESS -->)}
+ {$1\n<a href="mailto:$conf::ADDRESS">$conf::ADDRESS</a>\n$3}gs;
+ $buf =~ s{(<!-- LINK-REV-MADE -->)(.*)(<!-- LINK-REV-MADE -->)}
+ {$1\n<link rev="made" href="mailto:$conf::ADDRESS">\n$3}gs;
+
+ print $fh_file $buf;
+ util::fclose($fh_file);
+}
+
+# Make phrase hashes for NMZ.p.
+# Each pair of adjacent words is hashed to a value in the range 0-65535.
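+# For example, the word sequence "foo bar baz" yields the pairs "foobar"
+# and "barbaz"; for each pair's hash bucket the document ID is appended to
+# %PhraseHash as a 'w'-packed delta against the previous document that hit
+# the same bucket.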
+sub make_phrase_hash ($$$) {
+ my ($docid_count, $docid_base, $contref) = @_;
+
+ my %tmp = ();
+ $$contref =~ s!\x7f */? *\d+ *\x7f!!g; # remove weight markers
+ $$contref =~ tr/\xa1-\xfea-z0-9 \n//cd; # remove all symbols
+ my @words = split(/\s+/, $$contref);
+ @words = grep {$_ ne ""} @words; # remove empty words
+ my $word_b = shift @words;
+ my $docid = $docid_count + $docid_base;
+ for my $word (@words) {
+ next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
+ my $hash = hash($word_b . $word);
+ unless (defined $tmp{$hash}) {
+ $tmp{$hash} = 1;
+ $PhraseHashLast{$hash} = 0 unless defined $PhraseHashLast{$hash};
+ $PhraseHash{$hash} .= pack("w", $docid - $PhraseHashLast{$hash});
+# util::dprint("<$word_b, $word> $hash\n");
+ $PhraseHashLast{$hash} = $docid;
+ }
+ $word_b = $word;
+ }
+}
+
+# Construct the NMZ.p and NMZ.pi files. This processing is rather complex.
+sub write_phrase_hash () {
+ write_phrase_hash_sub();
+ util::Rename($var::NMZ{'__p'}, $var::NMZ{'_p'});
+ util::Rename($var::NMZ{'__pi'}, $var::NMZ{'_pi'});
+}
+
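+# Merge the in-memory %PhraseHash into the existing NMZ.p/NMZ.pi (if any):
+# for each of the 65536 buckets, read the old record (the index holds one
+# 4-byte big-endian offset per bucket, 0xffffffff meaning "no record"),
+# re-base the first document-ID delta of the new record against the last
+# document ID of the old one, and write the concatenated record to the
+# temporary files.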
+sub write_phrase_hash_sub () {
+ my $opened = 0;
+
+ return 0 unless %PhraseHash; # nothing to write (namazu-devel-ja #3146)
+ util::dprint(_("doing write_phrase_hash() processing.\n"));
+
+ my $fh_tmp_pi = util::efopen(">$var::NMZ{'__pi'}");
+ my $fh_tmp_p = util::efopen(">$var::NMZ{'__p'}");
+
+ my $fh_phrase = util::fopen($var::NMZ{'_p'});
+ my $fh_phraseindex = undef;
+ if ($fh_phrase) {
+ $fh_phraseindex = util::efopen($var::NMZ{'_pi'});
+ $opened = 1;
+ }
+
+ my $ptr = 0;
+ for (my $i = 0; $i < 65536; $i++) {
+
+ my $baserecord = "";
+ my $baseleng = 0;
+
+ if ($opened) {
+ my $tmp = 0;
+ read($fh_phraseindex, $tmp, $var::INTSIZE);
+ $tmp = unpack("N", $tmp);
+ if ($tmp != 0xffffffff) { # 0xffffffff means no record for this bucket
+ $baseleng = readw($fh_phrase);
+ read($fh_phrase, $baserecord, $baseleng);
+ }
+ }
+ if (defined($PhraseHash{$i})) {
+ if ($baserecord eq "") {
+ print $fh_tmp_pi pack("N", $ptr);
+ my $record = $PhraseHash{$i};
+ my $n2 = length($record);
+ my $data = pack("w", $n2) . $record;
+ print $fh_tmp_p $data;
+ $ptr += length($data);
+ } else {
+ print $fh_tmp_pi pack("N", $ptr);
+ my $record = $PhraseHash{$i};
+ my $last_docid = get_last_docid($baserecord, 1);
+ my $adjrecord = adjust_first_docid($record, $last_docid);
+ check_records(\$record, \$baserecord, 1) unless defined $adjrecord; # namazu-bugs-ja#31
+ $record = $adjrecord;
+ my $n2 = length($record) + $baseleng;
+ my $data = pack("w", $n2) . $baserecord . $record;
+ print $fh_tmp_p $data;
+ $ptr += length($data);
+ }
+ } else {
+ if ($baserecord eq "") {
+ # if $baserecord has no data, set to 0xffffffff
+ print $fh_tmp_pi pack("N", 0xffffffff);
+ } else {
+ print $fh_tmp_pi pack("N", $ptr);
+ my $data = pack("w", $baseleng) . $baserecord;
+ print $fh_tmp_p $data;
+ $ptr += length($data);
+ }
+ }
+ }
+
+ if ($opened) {
+ util::fclose($fh_phraseindex);
+ }
+ if (defined $fh_phrase) {
+ util::fclose($fh_phrase);
+ }
+ util::fclose($fh_tmp_p);
+ util::fclose($fh_tmp_pi);
+
+ %PhraseHash = ();
+ %PhraseHashLast = ();
+}
+
+# Dr. Knuth's ``hash'' (from UNIX MAGAZINE, May 1998)
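+# The hash XORs one precomputed table entry per character, cycling through
+# four tables indexed by byte value (@Seed is presumably initialized
+# elsewhere with random values), and keeps the low 16 bits.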
+sub hash ($) {
+ my ($word) = @_;
+
+ my $hash = 0;
+ for (my $i = 0; $word ne ""; $i++) {
+ $hash ^= $Seed[$i & 0x03][ord($word)];
+ $word = substr $word, 1;
+ # $word =~ s/^.//; is slower
+ }
+ return $hash & 65535;
+}
+
+# Count frequencies of words.
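+# The text is lower-cased and lightly normalized, segmented with wakati
+# when Japanese processing is enabled, split at the embedded \x7f weight
+# markers, and the per-document counts are appended to %KeyIndex as
+# (document-ID delta, count) pairs in 'w'-packed form.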
+sub count_words ($$$$) {
+ my ($docid_count, $docid_base, $contref, $kanji) = @_;
+ my (@tmp);
+
+ # Normalize into small letter.
+ $$contref =~ tr/A-Z/a-z/;
+
+ # Remove control char.
+ $$contref =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a/ /;
+
+ # It corresponds to -j option of ChaSen.
+ $$contref =~ s/^[ \t\f]+//gm; # except "\r\n"
+ $$contref =~ s/[ \t\f]+$//gm; # except "\r\n"
+ $$contref =~ s/([a-z])-\n([a-z])/$1$2/gsi; # for hyphenation
+ if (util::islang("ja")) {
+ $$contref =~ s/([\x80-\xff])\n([\x80-\xff])/$1$2/gs;
+ $$contref =~ s/(¡£|¡¢)/$1\n/gs;
+ }
+ $$contref =~ s/\n+/\n/gs;
+
+ # Do wakatigaki if necessary.
+ if (util::islang("ja")) {
+ wakati::wakatize_japanese($contref) if $kanji;
+ }
+
+ my $part1 = "";
+ my $part2 = "";
+ if ($$contref =~ /\x7f/) {
+ $part1 = substr $$contref, 0, index($$contref, "\x7f");
+ $part2 = substr $$contref, index($$contref, "\x7f");
+# $part1 = $PREMATCH; # $& and friends are not efficient
+# $part2 = $MATCH . $POSTMATCH;
+ } else {
+ $part1 = $$contref;
+ $part2 = "";
+ }
+
+ # do scoring
+ my %word_count = ();
+ $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
+ wordcount_sub($2, $1, \%word_count)!ge;
+ wordcount_sub($part1, 1, \%word_count);
+
+ # Add them to whole index
+ my $docid = $docid_count + $docid_base;
+ for my $word (keys(%word_count)) {
+ next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
+ $KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
+ $KeyIndex{$word} .= pack("w2",
+ $docid - $KeyIndexLast{$word},
+ $word_count{$word});
+ $KeyIndexLast{$word} = $docid;
+ }
+}
+
+#
+# Count words and do score weighting
+#
+sub wordcount_sub ($$\%) {
+ my ($text, $weight, $word_count) = @_;
+
+ # Remove all symbols when -K option is specified.
+ $text =~ tr/\xa1-\xfea-z0-9/ /c if $var::Opt{'nosymbol'};
+
+ # Count frequencies of words in a current document.
+ # Handle symbols as follows.
+ #
+ # tcp/ip -> tcp/ip, tcp, ip
+ # (tcp/ip) -> (tcp/ip), tcp/ip, tcp, ip
+ # ((tcp/ip)) -> ((tcp/ip)), (tcp/ip), tcp
+ #
+ # Don't do processing for nested symbols.
+ # NOTE: When -K is specified, all symbols are already removed.
+
+ my @words = split /\s+/, $text;
+ for my $word (@words) {
+ next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
+ if ($var::Opt{'noedgesymbol'}) {
+ # remove symbols at both ends
+ $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
+ }
+ $word_count->{$word} = 0 unless defined($word_count->{$word});
+ $word_count->{$word} += $weight;
+ unless ($var::Opt{'nosymbol'}) {
+ if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
+ $word_count->{$1} = 0 unless defined($word_count->{$1});
+ $word_count->{$1} += $weight;
+ next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
+ } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
+ $word_count->{$1} = 0 unless defined($word_count->{$1});
+ $word_count->{$1} += $weight;
+ next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
+ } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
+ $word_count->{$1} = 0 unless defined($word_count->{$1});
+ $word_count->{$1} += $weight;
+ next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
+ }
+ my @words_ = (); # declare first: 'my' with a statement modifier is unreliable
+ @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
+ if $word =~ /[^\xa1-\xfea-z_0-9]/;
+ for my $tmp (@words_) {
+ next if $tmp eq "";
+ $word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
+ $word_count->{$tmp} += $weight;
+ }
+ @words_ = ();
+ }
+ }
+ return "";
+}
+
+# Construct the NMZ.i and NMZ.ii files. This processing is rather complex.
+sub write_index () {
+ my $key_count = write_index_sub();
+ util::Rename($var::NMZ{'__i'}, $var::NMZ{'_i'});
+ util::Rename($var::NMZ{'__w'}, $var::NMZ{'_w'});
+
+ return $key_count;
+}
+
+# readw: read one 'w'-packed (BER-compressed) integer from a filehandle.
+# This code was contributed by <furukawa@tcp-ip.or.jp>.
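+# A 'w'-packed integer is stored 7 bits per byte with the high bit set on
+# all but the last byte; e.g. pack('w', 300) is the two bytes "\x82\x2c".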
+sub readw ($) {
+ my $fh = shift;
+ my $ret = '';
+ my $c;
+
+ while (read($fh, $c, 1)){
+ $ret .= $c;
+ last unless 0x80 & ord $c;
+ }
+ return unpack('w', $ret);
+}
+
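+# Recover the last absolute document ID of a 'w'-packed record by summing
+# its deltas; $step is 1 for phrase records (delta only) and 2 for word
+# records ((delta, count) pairs).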
+sub get_last_docid ($$) {
+ my ($record, $step) = @_;
+ my (@data) = unpack 'w*', $record;
+
+ my $sum = 0;
+ for (my $i = 0; $i < @data; $i += $step) {
+ $sum += $data[$i];
+ }
+ return $sum;
+}
+
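+# Re-base the first delta of a new record against the last document ID of
+# the record it is appended to; returns undef if the result would be
+# negative (namazu-bugs-ja#31).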
+sub adjust_first_docid ($$) {
+ my ($record, $last_docid) = @_;
+ my (@data) = unpack 'w*', $record;
+
+ $data[0] = $data[0] - $last_docid;
+ return undef if ($data[0] < 0); # namazu-bug-ja#31
+ $record = pack 'w*', @data;
+ return $record;
+}
+
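+# Merge the sorted in-memory %KeyIndex with the existing on-disk word list:
+# walk the old word and index files in parallel with the new words,
+# concatenate the records of words present in both, and write the merged
+# data together with the 4-byte offset tables to the output files.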
+sub write_index_sub () {
+ my @words = sort keys(%KeyIndex);
+ return 0 if $#words == -1;
+
+ my $cnt = 0;
+ my $ptr_i = 0;
+ my $ptr_w = 0;
+ my $key_count = 0;
+ my $baserecord = "";
+
+ util::dprint(_("doing write_index() processing.\n"));
+ my $fh_tmp_i = util::efopen(">$var::NMZ{'__i'}");
+ my $fh_tmp_w = util::efopen(">$var::NMZ{'__w'}");
+ my $fh_i = util::fopen($var::NMZ{'_i'});
+ my $fh_ii = util::efopen(">$var::NMZ{'_ii'}");
+ my $fh_w = util::fopen($var::NMZ{'_w'});
+ my $fh_wi = util::efopen(">$var::NMZ{'_wi'}");
+
+ if ($fh_w) {
+ FOO:
+ while (defined(my $line = <$fh_w>)) {
+ chop $line;
+ my $current_word = $line;
+
+ my $baseleng = readw($fh_i);
+ read($fh_i, $baserecord, $baseleng);
+
+ for (; $cnt < @words; $cnt++) {
+ last unless $words[$cnt] le $current_word;
+ my $record = $KeyIndex{$words[$cnt]};
+ my $leng = length($record);
+
+ if ($current_word eq $words[$cnt]) {
+ my $last_docid = get_last_docid($baserecord, 2);
+ my $adjrecord = adjust_first_docid($record, $last_docid);
+ check_records(\$record, \$baserecord, 2) unless defined $adjrecord; # namazu-bugs-ja#31
+ $record = $adjrecord;
+ $leng = length($record); # re-measure
+ my $tmp = pack("w", $leng + $baseleng);
+
+ my $data_i = "$tmp$baserecord$record";
+ my $data_w = "$current_word\n";
+ print $fh_tmp_i $data_i;
+ print $fh_tmp_w $data_w;
+ print $fh_ii pack("N", $ptr_i);
+ print $fh_wi pack("N", $ptr_w);
+ $ptr_i += length($data_i);
+ $ptr_w += length($data_w);
+ $key_count++;
+
+ $cnt++;
+ next FOO;
+ } else {
+ my $tmp = pack("w", $leng);
+ my $data_i = "$tmp$record";
+ my $data_w = "$words[$cnt]\n";
+ print $fh_tmp_i $data_i;
+ print $fh_tmp_w $data_w;
+ print $fh_ii pack("N", $ptr_i);
+ print $fh_wi pack("N", $ptr_w);
+ $ptr_i += length($data_i);
+ $ptr_w += length($data_w);
+ $key_count++;
+ }
+ }
+ my $tmp = pack("w", $baseleng);
+ my $data_i = "$tmp$baserecord";
+ my $data_w = "$current_word\n";
+ print $fh_tmp_i $data_i;
+ print $fh_tmp_w $data_w;
+ print $fh_ii pack("N", $ptr_i);
+ print $fh_wi pack("N", $ptr_w);
+ $ptr_i += length($data_i);
+ $ptr_w += length($data_w);
+ $key_count++;
+ }
+ }
+ while ($cnt < @words) {
+ my $leng = length($KeyIndex{$words[$cnt]});
+ my $tmp = pack("w", $leng);
+ my $record = $KeyIndex{$words[$cnt]};
+
+ my $data_i = "$tmp$record";
+ my $data_w = "$words[$cnt]\n";
+ print $fh_tmp_i $data_i;
+ print $fh_tmp_w $data_w;
+ print $fh_ii pack("N", $ptr_i);
+ print $fh_wi pack("N", $ptr_w);
+ $ptr_i += length($data_i);
+ $ptr_w += length($data_w);
+ $key_count++;
+ $cnt++;
+ }
+ %KeyIndex = ();
+ %KeyIndexLast = ();
+
+ util::fclose($fh_wi);
+ util::fclose($fh_w) if (defined $fh_w);
+ util::fclose($fh_ii);
+ util::fclose($fh_i) if (defined $fh_i);
+ util::fclose($fh_tmp_w);
+ util::fclose($fh_tmp_i);
+
+ return $key_count;
+}
+
+#
+# Decide the media type.
+# FIXME: Very ad hoc. It's just a compromise. -- satoru
+#
+sub decide_type ($$) {
+ my ($name, $cont) = @_;
+ return $name if (!defined $cont || $name eq $cont);
+
+ util::dprint("decide_type: name: $name, cont: $cont\n");
+ if ($cont =~ m!^text/plain! && $name =~ m!^text/plain!) {
+ return $name;
+ } elsif ($cont =~ m!^application/octet-stream! &&
+ $name !~ m!^text/!) {
+ return $name;
+ } elsif ($cont =~ m!^application/(excel|powerpoint|msword)! &&
+ $name !~ m!^application/octet-stream!) {
+ # FIXME: Currently File::MMagic 1.02's checktype_data()
+ # is unreliable for them.
+ return $name;
+ } elsif ($cont =~ m!^application/x-zip! &&
+ $name =~ m!^application/!) {
+ # The zip format is also used by other applications, e.g. OpenOffice.
+ # It is therefore necessary to check the file extension as well.
+ return $name;
+ }
+
+ return $cont;
+}
+
+#
+# Debugging code for the "negative numbers" problem.
+#
+sub check_records ($$$) {
+ my ($recref, $baserecref, $step) = @_;
+ dump_record($baserecref, $step);
+ dump_record($recref, $step);
+ print STDERR "The \x22negative number\x22 problem occurred.\n";
+ exit(1);
+}
+
+sub dump_record($$) {
+ my ($recref, $step) = @_;
+ my (@data) = unpack 'w*', $$recref;
+ print STDERR "dump record data to NMZ.bug.info (step: $step)...";
+ my $fh_info = util::fopen(">> NMZ.bug.info");
+ print $fh_info "dumped record data (step: $step)...";
+ foreach (@data) {
+ print $fh_info sprintf(" %08x", $_);
+ }
+ print $fh_info "\n";
+ util::fclose($fh_info);
+ return;
+}
+
+sub trapintr {
+ my ($signame) = @_;
+ print STDERR "Warning: signal $signame occured.\n";
+}
+
+#
+# To avoid "used only once: possible typo at ..." warnings.
+#
+muda($conf::ON_MEMORY_MAX,
+ $conf::WORD_LENG_MAX, $conf::TEXT_SIZE_MAX,
+ $conf::DENY_FILE, $var::INTSIZE,
+ $conf::CHASEN_NOUN, $conf::CHASEN,
+ $conf::KAKASI, $var::Opt{'okurigana'},
+ $var::Opt{'hiragana'}, $conf::DIRECTORY_INDEX,
+ $usage::USAGE, $var::Opt{'noheadabst'}, $usage::VERSION_INFO,
+ $var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX,
+ $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE,
+ $conf::ADDRESS, $var::MAILING_ADDRESS,
+ $conf::FILE_SIZE_MAX,
+ $conf::MECAB,
+ $conf::DENY_DDN,
+ $var::TRAC_URI,
+ );
+
+sub muda {}
+