#! /usr/bin/perl -w
# -*- Perl -*-
# mknmz - indexer of Namazu
# $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $
#
# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
# Copyright (C) 2000-2008 Namazu Project All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
#
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#
#  This file must be encoded in EUC-JP encoding
#

package mknmz;
require 5.004;
use English;
use lib ".";
use Cwd;
use IO::File;
use File::Find;
use File::MMagic;
use Time::Local;
use strict;  # be strict since v1.2.0
use Getopt::Long;
use File::Copy;
use DirHandle;
use File::Basename;

use vars qw($SYSTEM);  # It exists only for back compatibility.
$SYSTEM = $English::OSNAME;

my $NAMAZU_INDEX_VERSION = "2.0";

my $CodingSystem = "euc";

# Installation directories.  init() may prefix them with a drive letter on
# Windows/OS2 and load_rcfile() may override the last three from
# $conf::LIBDIR / $conf::FILTERDIR / $conf::TEMPLATEDIR.
my $PKGDATADIR  = $ENV{'pkgdatadir'} || "/usr/share/namazu";
my $CONFDIR     = "/etc/namazu";             # directory where mknmzrc are in.
my $LIBDIR      = $PKGDATADIR . "/pl";       # directory where library etc. are in.
my $FILTERDIR   = $PKGDATADIR . "/filter";   # directory where filters are in.
my $TEMPLATEDIR = $PKGDATADIR . "/template"; # directory where templates are in.

# File-level state shared by the subroutines below.
my $DeletedFilesCount = 0;
my $UpdatedFilesCount = 0;
my $APPENDMODE = 0;
my %PhraseHash = ();
my %PhraseHashLast = ();
my %KeyIndex = ();
my %KeyIndexLast = ();
# 'on'       : set by -s/--checkpoint (and by -S) in parse_options().
# 'continue' : set by -S/--checkpoint-sub, i.e. this process is a
#              continuation started after a checkpoint.
my %CheckPoint = ("on" => undef, "continue" => undef);
my $ConfigFile = undef;
my $MediaType = undef;
my $ReplaceCode = undef;  # perl code for transforming URI
my @Seed = ();
my @LoadedRcfiles = ();
my $Magic = new File::MMagic;
my $ReceiveTERM = 0;      # set to 1 by the SIGTERM handler installed in init()

STDOUT->autoflush(1);
STDERR->autoflush(1);

main();

#
# Program entry point.
#
# Initialises the environment, parses the command line, builds the list of
# target files (prep) and then indexes them one by one via process_file().
# Whenever the amount of processed data exceeds $conf::ON_MEMORY_MAX the
# in-memory index is flushed with write_index()/write_phrase_hash(); in
# checkpoint mode the process then hands over to re_exec().  Finishes with
# do_remain_job() and always terminates through exit.
#
sub main {
    my $start_time = time;

    # Perl 5.8.1 only: refuse to run unless PERL_HASH_SEED=0 is set.
    # NOTE(review): presumably because of hash-order randomisation in that
    # release - the reason is not stated in this file; confirm.
    if ($English::PERL_VERSION == 5.008001) {
        unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) {
            print "Run mknmz with the environment variable PERL_HASH_SEED=0\n";
            exit 1;
        }
    }

    init();

    # At first, loading pl/conf.pl to prevent overriding some variables.
    preload_modules();

    # set LANG and bind textdomain
    util::set_lang();
    textdomain('namazu', $util::LANG_MSG);

    load_modules();
    my ($output_dir, @targets) = parse_options();
    my ($docid_base, $total_files_num) = prep($output_dir, @targets);

    # NOTE(review): $swap is not referenced anywhere else in main().
    my $swap = 1;
    my $docid_count = 0;
    my $file_count = 0;
    my $total_files_size = 0;
    my $key_count = 0;
    my $checkpoint = 0;           # becomes 1 when a re-exec is required
    my $flist_ptr = 0;            # byte offset of the next unread line in the file list
    my $processed_files_size = 0; # bytes processed since the last index flush

    if ($CheckPoint{'continue'}) {
        # Restore variables
        # The checkpoint file is evaluated as Perl code in this lexical
        # scope.  NOTE(review): it presumably assigns to the lexicals
        # declared above and to the file-level counters by name, so they
        # must not be renamed without checking the writer (re_exec).
        # The result of the eval ($@) is not checked.
        eval util::readfile($var::NMZ{'_checkpoint'}) ;
    } else {
        print $total_files_num . _(" files are found to be indexed.\n");
    }

    {
        my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
        my $fh_flist = util::efopen($var::NMZ{'_flist'});
        my %field_indices = ();
        get_field_index_base(\%field_indices);

        if ($CheckPoint{'continue'}) {
            # Resume where the previous process stopped.
            seek($fh_flist, $flist_ptr, 0);
        }

        # Process target files one by one
        while (defined(my $line = <$fh_flist>)) {
            $flist_ptr += length($line);
            my $cfile = $line;
            chomp $cfile;
            util::dprint(_("target file: ")."$cfile\n");

            my ($cfile_size, $num) =
                process_file($cfile, $docid_count, $docid_base,
                             $file_count, \%field_indices,
                             $fh_errorsfile, $total_files_num);
            if ($num == 0) {
                # Nothing was indexed for this file: drop it from the total.
                $total_files_num--;
                next;
            } else {
                $docid_count += $num;
                $file_count++;
            }

            $total_files_size     += $cfile_size;
            $processed_files_size += $cfile_size;
            last if $ReceiveTERM;
            if ($processed_files_size > $conf::ON_MEMORY_MAX) {
                if (%KeyIndex) {
                    $key_count = write_index();
                    print _("Writing index files...");
                    write_phrase_hash();
                    print "\n";
                }
                $processed_files_size = 0;
                # In checkpoint mode, stop here if at least one more line
                # remains.  The probing read consumes that line, but
                # $flist_ptr is not advanced for it, so the continuation
                # process seeks back to it.
                $checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
            }
        }

        util::fclose($fh_flist);
        util::fclose($fh_errorsfile);
    }
    # This should be out of above blocks because of file handler closing.
    re_exec($flist_ptr, $docid_count, $docid_base, $start_time,
            $total_files_size, $total_files_num,
            $file_count, $key_count) if $checkpoint;

    # Flush whatever is still held in memory.
    if (%KeyIndex) {
        $key_count = write_index();
        print _("Writing index files...");
        write_phrase_hash();
        print "\n";
    }

    $key_count = get_total_keys() unless $key_count;
    do_remain_job($total_files_size, $docid_count, $key_count,
                  $start_time);
    exit 0;
}

#
# FIXME: Very complicated.
#
# Index one target file.
#
# With --html-split and an HTML file that htmlsplit::split() divides into
# more than one part, every part is indexed as its own document; otherwise
# the file is indexed as a whole.  Each successfully indexed document is
# recorded with put_registry().
#
# Returns ($file_size, $processed_num) where $processed_num is the number
# of documents added to the index (0 when nothing was indexed).
#
sub process_file ($$$$\%$$) {
    my ($cfile, $docid_count, $docid_base, $file_count,
        $field_indices, $fh_errorsfile, $total_files_num) = @_;

    my $processed_num = 0;
    my $file_size = util::filesize($cfile);

    if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
        my @parts;
        @parts = htmlsplit::split($cfile, "NMZ.partial")
            if ($file_size <= $conf::FILE_SIZE_MAX);
        if (@parts > 1) {
            # Iterate by index so that the temporary file number always
            # follows the position of the part.  Previously the counter was
            # not advanced for a part skipped by EXCLUDE_PATH, so every
            # later part was read from the preceding part's temporary file.
            # (Assumes part N lives in NMZ.partial.N, as the non-excluded
            # path of the original code already relied on.)
            for my $id (0 .. $#parts) {
                my $part = $parts[$id];
                next if (defined $conf::EXCLUDE_PATH &&
                         "$cfile#$part" =~ /$conf::EXCLUDE_PATH/);
                my $fname = util::tmpnam("NMZ.partial.$id");
                my $fragment = $part;
                my $uri = generate_uri($cfile, $fragment);
                my $result = namazu_core($fname,
                                         $docid_count + $processed_num,
                                         $docid_base, $file_count,
                                         $field_indices, $fh_errorsfile,
                                         $total_files_num,
                                         $uri, $id, $#parts);
                if ($result > 0) {
                    $processed_num++;
                    my $rname = defined $part ? "$cfile\t$part" : "$cfile";
                    put_registry($rname);
                }
                unlink $fname;
            }
            return ($file_size, $processed_num);
        }
    }
    my $result = namazu_core($cfile, $docid_count, $docid_base,
                             $file_count, $field_indices,
                             $fh_errorsfile, $total_files_num,
                             undef, undef, undef);
    if ($result > 0) {
        $processed_num++;
        put_registry($cfile);
    }

    return ($file_size, $processed_num);
}

#
# Load mknmzrcs:
#
#  1. MKNMZRC environment
#
#  2. $(sysconfdir)/$(PACKAGE)/mknmzrc
#
#  3. ~/.mknmzrc
#
#  4. user-specified mknmzrc set by mknmz --config=file option.
#
# If multiple files exist, read all of them.
# (This routine handles 1-3; parse_options() loads 4 itself.)
#
sub load_rcfiles () {
    my (@cand) = ();

    # To support Windows. Since they have nasty drive letter convention,
    # it is necessary to change mknmzrc dynamically with env. variable.
    push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'};
    push @cand, "$CONFDIR/mknmzrc";
    push @cand, "$ENV{'HOME'}/.mknmzrc";

    util::vprint(_("Reading rcfile: "));
    for my $rcfile (@cand) {
        if (-f $rcfile) {
            load_rcfile ($rcfile);
            util::vprint(" $rcfile");
        }
    }
    util::vprint("\n");
}

#
# Load a single rcfile with do() unless it has been loaded already, and
# record the outcome in @LoadedRcfiles (shown by show_config()).
#
sub load_rcfile ($) {
    my ($rcfile) = @_;
    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
        util::win32_yen_to_slash(\$rcfile);
    }

    # Compare literally.  The path used to be interpolated into a pattern,
    # so '.', '+', '(' etc. in a file name were treated as regex syntax.
    return if (grep { $_ eq $rcfile } @LoadedRcfiles);

    do $rcfile;
    if ($@) {
        chop $@;
        push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'";
    }else {
        push @LoadedRcfiles, $rcfile;
    }

    # Dirty workaround.
    $LIBDIR = $conf::LIBDIR
        if (defined $conf::LIBDIR && -d $conf::LIBDIR);
    $FILTERDIR = $conf::FILTERDIR
        if (defined $conf::FILTERDIR && -d $conf::FILTERDIR);
    $TEMPLATEDIR = $conf::TEMPLATEDIR
        if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR);
}

sub re_exec($$$$$$$$) {
    my ($flist_ptr, $docid_count, $docid_base, $start_time,
        $total_files_size, $total_files_num,
        $file_count, $key_count) = @_;

    # store variables
    {
        my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}");

        # NOTE(review): the text under review is damaged from here on.
        # Everything between "print $fh_checkpoint <" and
        # ">$var::NMZ{'_r'}\")" - the remainder of re_exec() and the head
        # of put_registry(), which process_file() calls - is missing.
        # The surviving statements and closing brace are kept verbatim
        # below; they do not parse.  Restore this span from version
        # control instead of reconstructing it by hand.
        print $fh_checkpoint <>$var::NMZ{'_r'}");
    print $fh_registry $filename, "\n";
    util::fclose($fh_registry);
}
        # NOTE(review): end of the damaged span.

# Initialization
#  $CodingSystem: Character Coding System 'euc' or 'sjis'
#
# On Windows and OS/2 the configuration directories that carry no drive
# letter (and are not UNC paths) get the drive letter of $0 prepended.
# Also installs the SIGINT (abort) and SIGTERM (finish current file, then
# stop; see $ReceiveTERM in main) handlers.
sub init () {
    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
        $CodingSystem = "sjis";
        for my $dir_ref (\$CONFDIR, \$LIBDIR, \$FILTERDIR, \$TEMPLATEDIR) {
            if ($$dir_ref !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
                $$dir_ref = $1 . $$dir_ref ;
            }
        }
    } else {
        $CodingSystem = "euc";
    }

    $SIG{'INT'}  = sub {
        util::cdie("SIGINT caught! Aborted.\n");
    };

    $SIG{'TERM'}  = sub {
        print STDERR "SIGTERM caught!\n";
        $ReceiveTERM = 1;
    };
}

# Load the libraries that must be available before the rcfiles and the
# command line are processed.
sub preload_modules () {
    unshift @INC, $LIBDIR;
    # workaround for test suites.
    unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'};

    require "var.pl" || die "unable to require \"var.pl\"\n";
    require "conf.pl" || die "unable to require \"conf.pl\"\n";
    require "util.pl" || die "unable to require \"util.pl\"\n";
    require "gettext.pl" || die "unable to require \"gettext.pl\"\n";
    require "ext.pl" || die "unable to require \"ext.pl\"\n";
}

# Load the library that parse_options() defers until the rcfiles and
# options have been applied.
sub postload_modules () {
    require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n";
}

# Load the remaining libraries and initialise @Seed.
sub load_modules () {
    require "usage.pl" || die "unable to require \"usage.pl\"\n";
    require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n";
    require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
    require "seed.pl" || die "unable to require \"seed.pl\"\n";
    require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";

    @Seed = seed::init();
}

# Load every filter module found in $FILTERDIR.
sub load_filtermodules () {
    unshift @INC, $FILTERDIR;

    #
    # Windows modules must be loaded first.
    # Because OLE filters have low precedence over normal ones.
    #
    load_win32modules() if $English::OSNAME eq "MSWin32";

    # Check filter modules
    my @filters = ();
    @filters = glob "$FILTERDIR/*.pl";

    load_filters(@filters);
}

# Load the Windows-only filter modules.
sub load_win32modules () {
    # Check filter modules
    my @filters = ();
    if (-f "../filter/win32/olemsword.pl") {  # to ease developing
        @filters = glob "../filter/win32/*.pl";
        unshift @INC, "../filter/win32";
    } else {
        @filters = glob "$FILTERDIR/win32/*.pl";
        unshift @INC, "$FILTERDIR/win32";
    }

    load_filters(@filters);
}

#
# Require each filter file and register the media types it announces in
# %var::Supported, %var::REQUIRE_ACTIONS, %var::RECURSIVE_ACTIONS,
# %var::REQUIRE_PRE_CODECONV and %var::REQUIRE_POST_CODECONV.
# A media type already marked 'yes' is not overridden by a filter whose
# status is 'no'.
#
sub load_filters (@) {
    my @filters = @_;

    for my $filter (@filters) {
        # The module name is the file name without ".pl".  Skip names that
        # do not match instead of silently reusing $1 from an earlier match.
        unless ($filter =~ m!([-\w]+)\.pl$!) {
            print STDERR "Warning: $filter is not a valid filter module name, skip it\n";
            next;
        }
        my $module = $1;
        require "$module.pl" || die "unable to require \"$module.pl\"\n";
        my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv);

        # The package name is only known at run time, hence string eval.
        eval "\@mtypes =    ${module}::mediatype();";
        die $@ if $@;  # eval error
        eval "\$status =    ${module}::status();";
        die $@ if $@;
        eval "\$recursive = ${module}::recursive();";
        die $@ if $@;
        eval "\$pre_codeconv  = ${module}::pre_codeconv();";
        die $@ if $@;
        eval "\$post_codeconv  = ${module}::post_codeconv();";
        die $@ if $@;
        eval "${module}::add_magic(\$Magic);";
        die $@ if $@;

        for my $mt (@mtypes) {
            next if (defined $var::Supported{$mt} &&
                     $var::Supported{$mt} eq 'yes' && $status eq 'no');
            $var::Supported{$mt} = $status;
            $var::REQUIRE_ACTIONS{$mt} = $module;
            $var::RECURSIVE_ACTIONS{$mt} = $recursive;
            $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv;
            $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv;
        }
    }
}

# Core routine.
#
# Loads and filters one document, rejects it if check_file() reports an
# error, and otherwise writes its field data, date index, word counts and
# phrase hash.
#
# Returns the file size on success and 0 if the document was rejected.
#
# FIXME: Too many parameters. They must be cleared.
#
sub namazu_core ($$$$$$$$$$) {
    my ($cfile, $docid_count, $docid_base,
        $file_count, $field_indices, $fh_errorsfile, $total_files_num,
        $uri, $part_id, $part_num) = @_;

    my $headings = "";
    my $content = "";
    my $weighted_str = "";
    my %fields;
    my $msg_prefix;

    # Note: a $part_id of 0 (first part of a split file) is false, so the
    # first part is reported with the file counter like a whole file.
    if ($part_id) {
        $msg_prefix = "    $part_id/$part_num - ";
    } else {
        $msg_prefix = $file_count + 1 . "/$total_files_num - ";
    }

    unless ($uri) {
        $uri = generate_uri($cfile);  # Make a URI from a file name.
    }
    my ($cfile_size, $text_size, $kanji, $mtype) =
        load_document(\$cfile, \$content, \$weighted_str,
                      \$headings, \%fields);

    {
        # 'utc' is taken from the document's own date field when a filter
        # supplied one, and from the file's mtime otherwise.
        $fields{'mtime'} = (stat($cfile))[9];
        my $utc = $fields{'mtime'};
        $utc = time::rfc822time_to_mtime($fields{'date'})
            if (defined $fields{'date'});
        if ($utc == -1) {
            my $date = $fields{'date'};
            print "$cfile Illegal date format. : $date\n";
            print $fh_errorsfile "$cfile Illegal date format. : $date\n";
            $utc = $fields{'mtime'};
            delete $fields{'date'};
        }
        $fields{'utc'} = $utc;
    }

    util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n");

    # Check if the file is acceptable.
    my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri);
    if (defined $err) {
        if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
            my $uri2 = codeconv::eucjp_to_shiftjis($uri);
            print $msg_prefix . "$uri2 $err\n";
        } else {
            print $msg_prefix . "$uri $err\n";
        }
        print $fh_errorsfile "$cfile $err\n";
        return 0;  # return 0 if error
    }

    # Print processing file name as URI.
    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
        my $uri2 = codeconv::eucjp_to_shiftjis($uri);
        print $msg_prefix . "$uri2 [$mtype]\n";
    } else {
        print $msg_prefix . "$uri [$mtype]\n";
    }

    # Add filename.
    my $filename = defined $cfile ? $cfile : '';
    codeconv::toeuc(\$filename);
    $filename = basename($filename);
    $fields{'filename'} = $filename;

    complete_field_info(\%fields, $cfile, $uri,
                        \$headings, \$content, \$weighted_str);
    put_field_index(\%fields, $field_indices);

    put_dateindex($cfile);
    $content .= "\n\n$filename\n\n";  # add filename
    $content .= $weighted_str;        # add weights
    count_words($docid_count, $docid_base, \$content, $kanji);
    make_phrase_hash($docid_count, $docid_base, \$content);

    # assertion
    util::assert($cfile_size != 0,
                 "cfile_size == 0 at the end of namazu_core.");

    return $cfile_size;
}

#
# Make the URI from the given file name.
#
# Takes ($file [, $fragment]).  Strips a trailing directory-index file
# name, applies the user's --replace code, converts Windows drive letters,
# percent-encodes unsafe characters (unless --no-encode-uri) and appends
# "#fragment" when one is given.  Returns "" if $file is undefined.
#
sub generate_uri (@) {
    my ($file, $fragment) = @_;
    return "" unless defined $file;

    # omit a file name if omittable
    $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o;

    if (defined $ReplaceCode) {
        # transforming URI by evaling
        # The user code operates on $_ (see the dry run in parse_options).
        $_ = $file;
        eval $ReplaceCode;
        $file = $_;
    }
    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
        $file =~ s#^([A-Z]):#/$1|#i;  # converting a drive part like: /C|
    }
    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
        $file = codeconv::shiftjis_to_eucjp($file);
    }
    if (defined $fragment) {
        codeconv::toeuc(\$fragment);
    }

    unless ($var::Opt{'noencodeuri'}) {
        # $tmp aliases $file and $fragment, so both are modified in place.
        for my $tmp ($file, $fragment) {
            next unless defined $tmp;

            # Escape unsafe characters (not strict)
            $tmp =~ s/\%/%25/g;  # Convert original '%' into '%25' v1.1.1.2
            $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/ sprintf("%%%02X",ord($1))/ge;
        }
    }

    my $uri = $file;
    $uri .= "#" . $fragment if defined $fragment;
    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
        # restore '|' for drive letter rule of Win32, OS/2
        $uri =~ s!^/([A-Z])%7C!/$1|!i;
    }
    return $uri;
}

# Fill %$field_indices with the current size of every field file
# (existing index file plus its temporary counterpart), i.e. the offset at
# which the next record of that field will start.
sub get_field_index_base (\%) {
    my ($field_indices) = @_;

    my @keys = split('\|', $conf::SEARCH_FIELD);
    if ($var::Opt{'meta'}) {
        push @keys, (split '\|', $conf::META_TAGS);
    }
    for my $key (@keys) {
        $key = lc($key);
        my $fname = "$var::NMZ{'field'}.$key";
        my $tmp_fname = util::tmpnam("NMZ.field.$key");
        my $size = 0;
        $size = -s $fname if -f $fname;
        $size += -s $tmp_fname if -f $tmp_fname;
        $field_indices->{$key} = $size;
    }
}

# Drop empty fields, then supply defaults for title, date, uri, size,
# summary and from when the filters did not provide them.
sub complete_field_info (\%$$\$\$\$) {
    my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_;

    for my $field (keys %{$fields}) {
        if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) {
            delete $fields->{$field};
        }
    }

    unless (defined($fields->{'title'})) {
        $fields->{'title'} = gfilter::filename_to_title($cfile, $wsref);
    }
    unless (defined($fields->{'date'})) {
        my $mtime = $fields->{'mtime'};
        my $date = util::rfc822time($mtime);
        $fields->{'date'} = $date;
    }
    unless (defined($fields->{'uri'})) {
        $fields->{'uri'} = $uri;
    }
    unless (defined($fields->{'size'})) {
        $fields->{'size'} = -s $cfile;
    }
    unless (defined($fields->{'summary'})) {
        $fields->{'summary'} = make_summary($contref, $headings, $cfile);
    }
    unless (defined($fields->{'from'}) || defined($fields->{'author'})) {
        $fields->{'from'} = getmsg("unknown");
    }
}

#
# Messages for NMZ.* files should currently be encoded in EUC-JP.
# We cannot use gettext.pl for the message because gettext.pl may use
# Shift_JIS encoded messages.
# So, we should use this function instead of gettext().
#
# FIXME: Ad hoc impl. getmsg() is effective only for "unknown".
#
sub getmsg($) {
    my ($msg) = @_;

    if (util::islang_msg("ja")) {
        if ($msg eq "unknown") {
            # EUC-JP bytes C9 D4 CC C0 (Japanese for "unknown").  Written
            # as escapes so the value does not depend on how this source
            # file itself is encoded.
            return "\xc9\xd4\xcc\xc0";
        }
    }
    return $msg;
}

# Build the summary field from the headings followed by the beginning of
# the content, limited to $conf::MAX_FIELD_LENGTH - 1 bytes.
# $contref and $headings are scalar references; $$headings is normalised
# in place.
sub make_summary ($$$) {
    my ($contref, $headings, $cfile) = @_;

    # pick up $conf::MAX_FIELD_LENGTH bytes string
    my $tmp = "";
    if ($$headings ne "") {
        $$headings =~ s/^\s+//;
        $$headings =~ s/\s+/ /g;
        $tmp = $$headings;
    } else {
        $tmp = "";
    }

    my $offset = 0;
    my $tmplen = 0;
    # Keep appending content until the buffer is long enough; the two
    # substitutions may shorten it again, hence the loop.
    while (($tmplen = $conf::MAX_FIELD_LENGTH + 1 - length($tmp)) > 0
           && $offset < length($$contref))
    {
        $tmp .= substr $$contref, $offset, $tmplen;
        $offset += $tmplen;
        # Drop two-byte characters whose first byte is 0xA8.
        $tmp =~ s/(([\xa1-\xfe]).)/$2 eq "\xa8" ? '': $1/ge;
        # Shorten runs of three or more of - = * # to two.
        $tmp =~ s/([-=*\#])\1{2,}/$1$1/g;
    }

    # -1 means "LF"
    my $summary = substr $tmp, 0, $conf::MAX_FIELD_LENGTH - 1;
    # Remove a garbage Kanji 1st char at the end.
    $summary = codeconv::chomp_eucjp($summary);

    $summary =~ s/^\s+//;
    $summary =~ s/\s+/ /g;  # normalize white spaces

    return $summary;
}

# output the field information into NMZ.field.* files
#
# For every search field one line is appended to the temporary field file
# and the record's start offset (packed "N") is appended to its ".i"
# companion; $field_indices is advanced by the length written.
sub put_field_index (\%$) {
    my ($fields, $field_indices) = @_;

    # Move values stored under an alias to the canonical field name,
    # unless the canonical field already has a value.
    my $aliases_regex =
        join('|', sort {length($b) <=> length($a)} keys %conf::FIELD_ALIASES);

    for my $field (keys %{$fields}) {
        util::dprint("Field: $field: $fields->{$field}\n");
        if ($field =~ /^($aliases_regex)$/o) {
            unless (defined($fields->{$conf::FIELD_ALIASES{$field}})) {
                $fields->{$conf::FIELD_ALIASES{$field}} = $fields->{$field};
            }
            undef $fields->{$field};
        }
    }

    my @keys = split '\|', $conf::SEARCH_FIELD;
    if ($var::Opt{'meta'}) {
        my @meta = split '\|', $conf::META_TAGS;
        while (my $meta = shift(@meta)) {
            $meta = $conf::FIELD_ALIASES{$meta}
                if (defined $conf::FIELD_ALIASES{$meta});

            push @keys, $meta;
        }

        # uniq @keys
        my %mark = ();
        @keys = grep {$mark{$_}++; $mark{$_} == 1} @keys;
    }
    for my $key (@keys) {
        my $lkey = lc($key);
        my $fname    = util::tmpnam("NMZ.field.$lkey");
        my $fh_field = util::efopen(">>$fname");
        my $output = "";
        if (defined($fields->{$key})) {
            if ($key ne 'uri') {  # workaround for namazu-bugs-ja#30
                $fields->{$key} =~ s/\s+/ /g;
                $fields->{$key} =~ s/\s+$//;
                $fields->{$key} =~ s/^\s+//;
            }
            $output = $fields->{$key};

            # -1 means "LF"
            $output = substr $output, 0, $conf::MAX_FIELD_LENGTH - 1;
            # Remove a garbage Kanji 1st char at the end.
            $output = codeconv::chomp_eucjp($output);

            $output =~ s/\n.*$//s;
            $output .= "\n";
        } else {
            $output = "\n";
        }

        print $fh_field $output;
        util::fclose($fh_field);

        # put index of field index
        {
            my $fname        = util::tmpnam("NMZ.field.$lkey.i");
            my $fh_field_idx = util::efopen(">>$fname");
            print $fh_field_idx pack("N", $field_indices->{$lkey});
            $field_indices->{$lkey} += length $output;
            util::fclose($fh_field_idx);
        }
    }
}

# put the date information into NMZ.t file
sub put_dateindex ($) {
    my ($cfile) = @_;
    my $mtime = (stat($cfile))[9];

    my $fh_dataindex = util::efopen(">>$var::NMZ{'_t'}");
    print $fh_dataindex pack("N", $mtime);
    util::fclose($fh_dataindex);
}

# load a document file
#
# Reads the file named by $$orig_cfile into $$contref and runs the filters
# on it.  Returns ($file_size, $text_size, $kanji, $mtype); returns
# (0, 0, 0, 0) when the file cannot be read, and an x-system/x-error media
# type when it is larger than $conf::FILE_SIZE_MAX.
sub load_document ($$$$\%) {
    my ($orig_cfile, $contref, $weighted_str, $headings, $fields) = @_;
    my $cfile = $$orig_cfile;

    return (0, 0, 0, 0) unless (-f $cfile && util::canopen($cfile));

    # for handling a filename which contains Shift_JIS code for Windows.
    # for handling a filename which contains including space.
    # Such files are copied to a temporary name and read from there.
    my $shelter_cfile = "";
    if (($cfile =~ /\s/) ||
        ($English::OSNAME eq "MSWin32"
         && $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/) )
    {
        $shelter_cfile = $cfile;
        $cfile = util::tmpnam("NMZ.win32");
        unlink $cfile if (-e $cfile);
        copy($shelter_cfile, $cfile);
    }

    my $file_size;
    $file_size = util::filesize($cfile);  # not only file in feature.
    if ($file_size > $conf::FILE_SIZE_MAX) {
        return ($file_size, $file_size, 0, 'x-system/x-error; x-error=file_size_max');
    }

    $$contref = util::readfile($cfile);
    # $file_size = length($$contref);

    my ($kanji, $mtype) = apply_filter($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, undef);

    # NOTE(review): the temporary copy is removed here on MSWin32 only,
    # although it is also created for names containing white space on
    # other systems.
    if ($English::OSNAME eq "MSWin32" && $shelter_cfile ne "") {
        unlink $cfile;
        $cfile = $shelter_cfile;
    }

    # Measure the text size at this time.
    my $text_size = length($$contref) + length($$weighted_str);

    return ($file_size, $text_size, $kanji, $mtype);
}

# Detect the media type of the document and run the matching filter
# module on it, repeating while the filter is marked recursive.
#
# Returns ($kanji, $mtype) on success, where $kanji counts the bytes in
# the range 0xA1-0xFE in the text; returns (0, "<type>; x-...") when the
# type is unsupported or the filter failed.
#
# The lexicals $orig_cfile, $contref, $weighted_str, $headings and $fields
# are referenced by name from the string eval below - do not rename them.
sub apply_filter($$$$$$$) {
    my ($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, $mmtype) = @_;
    my $cfile = $shelter_cfile ne "" ? $shelter_cfile : $$orig_cfile;

    # Filtering process.
    my $mtype;
    my $called_dt = 0;
    while (1) {
        if (defined $MediaType) {
            $mtype = $MediaType;
        } elsif (defined $mmtype) {
            $mtype = $mmtype;
        } else {
            my $mtype_n = $Magic->checktype_byfilename($cfile);
            my $mtype_c = $Magic->checktype_data($$contref);
            my $mtype_m;
            # Consult the magic database only when the data check was
            # inconclusive, and prefer its answer if it is more specific.
            $mtype_m = $Magic->checktype_magic($$contref)
                if ((! defined $mtype_c) ||
                    $mtype_c =~
                    /^(text\/html|text\/plain|application\/octet-stream)$/);
            $mtype_c = $mtype_m
                if (defined $mtype_m &&
                    $mtype_m !~
                    /^(text\/html|text\/plain|application\/octet-stream)$/);
            $mtype_c = 'text/plain' unless defined $mtype_c;
            # The file name is taken into account on the first pass only.
            if ($called_dt) {
                $mtype = $mtype_c;
            } else {
                $mtype = decide_type($mtype_n, $mtype_c);
                $called_dt = 1;
            }
        }
        util::dprint(_("Detected type: ")."$mtype\n");

        # Pre code conversion.
        if ($var::REQUIRE_PRE_CODECONV{$mtype}) {
            util::dprint("pre_codeconv\n");
            codeconv_document($contref);
        }

        if (! $var::Supported{$mtype} ||
            $var::Supported{$mtype} ne 'yes')
        {
            util::vprint(_("Unsupported media type ")."$mtype\n");
            return (0, "$mtype; x-system=unsupported");
        }

        if ($var::REQUIRE_ACTIONS{$mtype}) {
            util::vprint(_("Using ")."$var::REQUIRE_ACTIONS{$mtype}.pl\n");
            require $var::REQUIRE_ACTIONS{$mtype}.'.pl'
                || die _("unable to require ") .
                    "\"$var::REQUIRE_ACTIONS{$mtype}.pl\"\n";
            my $err = undef;
            {
                local $SIG{'PIPE'} = \&trapintr;
                eval '$err = ' . $var::REQUIRE_ACTIONS{$mtype} .
                    '::filter($orig_cfile, $contref, $weighted_str, $headings, $fields);';
            }
            if ($err) {
                if ($err =~ m/; x-system=unsupported$/) {
                    return (0, $err);
                }
                return (0, "$mtype; x-error=$err");
            }

            if ($@) {
                util::vprint(_("Failed to call ")."$var::REQUIRE_ACTIONS{$mtype}\n$@\n");
                return (0, "$mtype; x-error=$@");
            }

            # Post code conversion.
            if ($var::REQUIRE_POST_CODECONV{$mtype}) {
                util::dprint("post_codeconv\n");
                codeconv_document($contref);
            }

            next if ($var::RECURSIVE_ACTIONS{$mtype});
        }
        last;
    }

    my $kanji = $$contref =~ tr/\xa1-\xfe/\xa1-\xfe/;  # Kanji contained?
    $kanji += $$weighted_str =~ tr/\xa1-\xfe/\xa1-\xfe/;

    return ($kanji, $mtype);
}

# Convert the text to EUC, normalise line endings to LF and blank out
# control characters (in place, through the reference).
sub codeconv_document ($) {
    my ($textref) = @_;
    codeconv::toeuc($textref);
    $$textref =~ s/\r\n/\n/g;
    $$textref =~ s/\r/\n/g;
    $$textref =~ tr/\x01-\x08\x0b-\x0c\x0e-\x1f\x7f/ /;  # Remove control char.
}

# Prepare an indexing run: takes ($output_dir, @targets), collects the
# files to index, writes them to the file list and creates the lock file.
# Returns ($docid_base, $total_files_num); returns (0, 0) right away when
# continuing from a checkpoint.  Exits if there is nothing to index.
#
# Declared without a prototype: the former empty prototype "()"
# contradicted the arguments this routine takes.
sub prep {
    my $docid_base = 0;
    my $output_dir = shift @_ ;
    my @targets = @_ ;
    my @flist = ();

    $var::OUTPUT_DIR = $output_dir;

    require_modules();
    change_filenames();
    check_present_index();

    # if Checkpoint mode, return
    return (0, 0) if $CheckPoint{'continue'};

    check_lockfile($var::NMZ{'lock2'});

    print _("Looking for indexing files...\n");
    @flist = find_target(@targets);
    ($docid_base, @flist) = append_index(@flist)
        if -f $var::NMZ{'r'};
    unless (@flist) {  # if @flist is empty
        print _("No files to index.\n");
        exit 0;
    }
    set_lockfile($var::NMZ{'lock2'});
    save_flist(@flist);
    my $total_files_num = @flist;

    return ($docid_base, $total_files_num);
}

# Write the list of target files, one per line, to the file list that
# main() reads back.
sub save_flist(@) {
    my @flist = @_;
    return if (@flist == 0);

    my $fh_flist = util::efopen(">$var::NMZ{'_flist'}");
    print $fh_flist join("\n", @flist), "\n";
    util::fclose($fh_flist);
}

# Load the optional Perl modules selected by $conf::NKF and $conf::WAKATI
# (Japanese only).
sub require_modules() {
    if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) {
        require NKF || die "unable to require \"NKF\"\n";
        util::dprint(_("code conversion: using NKF module\n"));
        $var::USE_NKF_MODULE = 1;
    }
    if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) {
        require Text::Kakasi || die "unable to require \"Text::Kakasi\"\n";
        util::dprint(_("wakati: using Text::Kakasi module\n"));
        my $res = Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-oeuc', '-w');
    }
    if (util::islang("ja") && $conf::WAKATI =~ /^module_chasen/) {
        require Text::ChaSen || die "unable to require \"Text::ChaSen\"\n";
        util::dprint(_("wakati: using Text::ChaSen module\n"));
        my @arg = ('-i', 'e', '-j', '-F', '%m ');
        @arg = ('-i', 'e', '-j', '-F', '%m %H\\n') if $var::Opt{'noun'};
        my $res = Text::ChaSen::getopt_argv('chasen-perl', @arg);
    }
    if (util::islang("ja") && $conf::WAKATI =~ /^module_mecab/) {
        require MeCab || die "unable to require \"MeCab\"\n";
        util::dprint(_("wakati: using MeCab module\n"));
    }
}

sub check_lockfile ($) {
    # warn if check file exists in case other process is running or abnormal
    # stop execution (later is not the major purpose, though).
    # This is mainly for early detection before longish find_target.
    my ($file) = @_;

    if (-f $file) {
        print "$file "._("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n");
        exit 1;
    }
}

# Create the lock file and store our pid in it.  Exits (through
# check_lockfile, with the same translated message) if it already exists.
sub set_lockfile ($) {
    my ($file) = @_;

    # make a lock file
    check_lockfile($file);

    my $fh_lockfile = util::efopen(">$file");
    print $fh_lockfile "$$";  # save pid
    util::fclose($fh_lockfile);
}

sub remove_lockfile ($) {
    my ($file) = @_;

    # remove lock file
    unlink $file if -f $file;
}

# check present index whether it is old type or not
sub check_present_index () {
    if (-f $var::NMZ{'i'} && ! -f "$var::NMZ{'wi'}")
    {
        util::cdie(_("Present index is old type. it's unsupported.\n"));
    }
}

# remain
#
# Final stage of a run: writes or updates the index files (under the main
# lock) when documents were indexed, updates only the date index and the
# registry when files were merely deleted/updated, then regenerates the
# head/foot pages, writes the log and removes the temporary files.
sub do_remain_job ($$$$) {
    my ($total_files_size, $docid_count, $key_count, $start_time) = @_;

    if ($docid_count == 0) {
        # No files are indexed
        if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
            update_dateindex();
            update_registry($docid_count);
        }
    } else {
        set_lockfile($var::NMZ{'lock'});
        write_version();
        write_body_msg();
        write_tips_msg();
        write_result_file();
        update_field_index();
        update_dateindex();
        update_registry($docid_count);
        write_nmz_files();
        make_slog_file();
        remove_lockfile($var::NMZ{'lock'});
    }
    make_headfoot_pages($docid_count, $key_count);
    put_log($total_files_size, $start_time, $docid_count, $key_count);
    util::remove_tmpfiles();
    unlink $var::NMZ{'_flist'};
}

# Generate one head page and one foot page for every NMZ.head* / NMZ.foot*
# template found in $TEMPLATEDIR, keeping the template's suffix.
sub make_headfoot_pages($$) {
    my ($docid_count, $key_count) = @_;

    for my $file (glob "$TEMPLATEDIR/NMZ.head*") {
        if ($file =~ m!^.*/NMZ\.head(\.[-\w\.]+)?$!){
            my $suffix = $1 ? $1 : '';
            make_headfoot("$var::NMZ{'head'}${suffix}", $docid_count, $key_count);
        }
    }
    for my $file (glob "$TEMPLATEDIR/NMZ.foot*") {
        if ($file =~ m!^.*/NMZ\.foot(\.[-\w\.]+)?$!){
            my $suffix = $1 ? $1 : '';
            make_headfoot("$var::NMZ{'foot'}${suffix}", $docid_count, $key_count);
        }
    }
}

# Parse command line options.
#
# Returns ($output_dir, @targets).  Besides filling the option variables
# this routine loads the rcfiles and the filter modules, applies the
# option overrides to the conf:: variables and finally restores @ARGV to
# its original content (minus -S) so that re_exec() can reuse it.
#
# With --update=DIR the argument vector and working directory recorded in
# that index are restored and the routine calls itself once to parse them.
#
# The order of the steps below matters (rcfiles before option overrides,
# filters before postload_modules); keep it when editing.
#
sub parse_options
{
    if (@ARGV == 0) {
        show_mini_usage();
        exit 1;
    }

    my @targets = ();
    my $targets_loaded = 0;
    my @argv = @ARGV;   # saved so that @ARGV can be reverted at the end
    my $cwd = cwd();

    my $opt_dummy = 0;
    my $opt_version = 0;
    my $opt_help = 0;
    my $opt_all = 0;
    my $opt_chasen = 0;
    my $opt_chasen_noun = 0;
    my $opt_kakasi = 0;
    my $opt_mecab = 0;
    my $opt_checkpoint_sub = 0;
    my $opt_show_config = 0;
    my $opt_mailnews = 0;
    my $opt_mhonarc = 0;
    my $opt_norc = 0;

    my $opt_quiet = undef;
    my $opt_config = undef;
    my $output_dir = undef;
    my $update_index = undef;
    my $include_file = undef;
    my $target_list = undef;
    my $index_lang = undef;

    my %opt_conf;   # overrides for conf:: scalars, keyed by variable name

#   Getopt::Long::Configure('bundling');
    Getopt::Long::config('bundling');
    GetOptions(
               '0|help'              => \$opt_help,
               '1|exclude=s'         => \$opt_conf{'EXCLUDE_PATH'},
               '2|deny=s'            => \$opt_conf{'DENY_FILE'},
               '3|allow=s'           => \$opt_conf{'ALLOW_FILE'},
               '4|update=s'          => \$update_index,
               '5|mhonarc'           => \$opt_mhonarc,
               '6|mtime=s'           => \$var::Opt{'mtime'},
               '7|html-split'        => \$var::Opt{'htmlsplit'},
               'C|show-config'       => \$opt_show_config,
               'E|no-edge-symbol'    => \$var::Opt{'noedgesymbol'},
               'F|target-list=s'     => \$target_list,
               'G|no-okurigana'      => \$var::Opt{'okurigana'},
               'H|no-hiragana'       => \$var::Opt{'hiragana'},
               'I|include=s'         => \$include_file,
               'K|no-symbol'         => \$var::Opt{'nosymbol'},
               'L|indexing-lang=s'   => \$index_lang,
               'M|meta'              => \$var::Opt{'meta'},
               'O|output-dir=s'      => \$output_dir,
               'S|checkpoint-sub'    => \$opt_checkpoint_sub,
               'T|template-dir=s'    => \$TEMPLATEDIR,
               'U|no-encode-uri'     => \$var::Opt{'noencodeuri'} ,
               'V|verbose'           => \$var::Opt{'verbose'},
               'Y|no-delete'         => \$var::Opt{'nodelete'},
               'Z|no-update'         => \$var::Opt{'noupdate'},
               'a|all'               => \$opt_all,
               'b|use-mecab'         => \$opt_mecab,
               'c|use-chasen'        => \$opt_chasen,
               'd|debug'             => \$var::Opt{'debug'},
               'e|robots'            => \$var::Opt{'robotexclude'},
               'f|config=s'          => \$opt_config,
               'h|mailnews'          => \$opt_mailnews,
               'k|use-kakasi'        => \$opt_kakasi,
               'm|use-chasen-noun'   => \$opt_chasen_noun,
               'q|quiet'             => \$opt_quiet,
               'r|replace=s'         => \$ReplaceCode,
               's|checkpoint'        => \$CheckPoint{'on'},
               't|media-type=s'      => \$MediaType,
               'u|uuencode'          => \$opt_dummy,  # for backward compat.
               'v|version'           => \$opt_version,
               'x|no-heading-summary'=> \$var::Opt{'noheadabst'},
               'z|check-filesize'    => \$var::Opt{'checkfilesize'},
               'decode-base64'       => \$var::Opt{'decodebase64'},
               'norc'                => \$opt_norc,
               );

    if ($opt_quiet) {
        # Make STDOUT quiet by redirecting STDOUT to null device.
        my $devnull = util::devnull();
        open(STDOUT, ">$devnull") || die "$devnull: $!";
    }

    if (defined $update_index) {
        unless (-d $update_index) {
            print _("No such index: "), "$update_index\n";
            exit 1;
        }

        # Read the recorded argv/cwd from the status file of that index.
        my $orig_status = $var::NMZ{'status'};
        $var::NMZ{'status'} = "$update_index/$var::NMZ{'status'}";

        my $argv = get_status("argv");
        if (!defined $argv) {
            print _("No such index: "), "$update_index\n";
            exit 1;
        }
        @ARGV = split /\t/, $argv;
        util::dprint(_("Inherited argv: ")."@ARGV\n");

        my $cwd = get_status("cwd");
        if (!defined $cwd) {
            print _("No such index: "), "$update_index\n";
            exit 1;
        }
        chdir $cwd;
        util::dprint(_("Inherited cwd: ")."$cwd\n");

        ($output_dir, @targets) = parse_options();
        $output_dir = $update_index;
        $var::NMZ{'status'} = $orig_status;  # See also change_filenames()
        return ($output_dir, @targets);
    }

    if (!$opt_norc && !(defined $ENV{'MKNMZNORC'})){
        load_rcfiles();
    }
    if ($opt_config) {
        if (-f $opt_config) {
            util::vprint(_("Reading rcfile: "));
            load_rcfile($ConfigFile = $opt_config);
            util::vprint(" $opt_config\n");
        }
    }

    if ($index_lang) {
        $util::LANG = $index_lang;
        util::dprint("Override indexing language: $util::LANG\n");
    }

    if ($opt_help) {
        show_usage();
        exit 1;
    }

    if ($opt_version) {
        show_version();
        exit 1;
    }

    load_filtermodules();  # to make effect $opt_config, $index_lang.
    postload_modules();

    # Copy the --exclude/--deny/--allow values into the conf:: scalars of
    # the same name.
    foreach my $key (keys %opt_conf){
        if (defined ($opt_conf{$key})) {
            ${*{$conf::{$key}}{SCALAR}} = $opt_conf{$key};
        }
    }

    if ($opt_mailnews) {
        $MediaType = 'message/rfc822';
    }
    if ($opt_mhonarc) {
        $MediaType = 'text/html; x-type=mhonarc';
    }

    if ($opt_all) {
        $conf::ALLOW_FILE = ".*";
    }

    if ($opt_chasen) {
        $conf::WAKATI = $conf::CHASEN;
        $var::Opt{'noun'} = 0;
    }

    if ($opt_chasen_noun) {
        $conf::WAKATI = $conf::CHASEN_NOUN;
        $var::Opt{'noun'} = 1;
    }

    if ($opt_kakasi) {
        $conf::WAKATI = $conf::KAKASI;
        $var::Opt{'noun'} = 0;
    }

    if ($opt_mecab) {
        $conf::WAKATI = $conf::MECAB;
        $var::Opt{'noun'} = 0;
    }

    if ($include_file) {
        do $include_file;
        util::dprint("Included: $include_file\n");
    }

    if ($target_list) {
        if ($CheckPoint{'continue'}) {
            @targets = ("dummy");
        } else {
            @targets = load_target_list($target_list);
            util::dprint(_("Loaded: ")."$target_list\n");
        }
        $targets_loaded = 1;
    }

    if ($opt_checkpoint_sub) {
        $CheckPoint{'on'}       = 1;
        $CheckPoint{'continue'} = 1;
        @argv = grep {! /^-S$/} @argv;  # remove -S
    }

    if (defined $ReplaceCode) {
        # Dry run of the user's replace code to catch errors early.
        my $orig = "/foo/bar/baz/quux.html";
        $_ = $orig;
        eval $ReplaceCode;
        if ($@) {  # eval error
            util::cdie(_("Invalid replace: ")."$ReplaceCode\n");
        }
        util::dprint(_("Replace: ")."$orig -> $_\n");
    }

    if ($opt_show_config) {
        show_config();
        exit 1;
    }

    if (@ARGV == 0 && $targets_loaded == 0) {
        show_mini_usage();
        exit 1;
    }

    $output_dir = $cwd unless defined $output_dir;
    util::cdie("$output_dir: "._("invalid output directory\n"))
        unless (-d $output_dir && -w $output_dir);

    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
        util::win32_yen_to_slash(\$output_dir);
    }

    # take remaining @ARGV as targets
    if (@ARGV > 0 && $targets_loaded == 0) {
        @targets = @ARGV ;
    }

    # revert @ARGV
    # unshift @ARGV, splice(@argv, 0, @argv - @ARGV);
    @ARGV = @argv;

    return ($output_dir, @targets);
}

# Print the effective configuration (for -C/--show-config).
sub show_config () {
    print _("Loaded rcfile: ") . "@LoadedRcfiles\n" if @LoadedRcfiles;
    print _("System: ") . "$English::OSNAME\n" if $English::OSNAME;
    print _("Namazu: ") . "$var::VERSION\n" if $var::VERSION;
    print _("Perl: ") . sprintf("%f\n", $English::PERL_VERSION);
    print _("File-MMagic: ") . "$File::MMagic::VERSION\n" if $File::MMagic::VERSION;
    print _("NKF: ") . "$conf::NKF\n" if $conf::NKF;
    print _("KAKASI: ") . "$conf::KAKASI\n" if $conf::KAKASI;
    print _("ChaSen: ") . "$conf::CHASEN\n" if $conf::CHASEN;
    print _("MeCab: ") . "$conf::MECAB\n" if $conf::MECAB;
    print _("Wakati: ") . "$conf::WAKATI\n" if $conf::WAKATI;
    print _("Lang_Msg: ") . "$util::LANG_MSG\n";
    print _("Lang: ") . "$util::LANG\n";
    print _("Coding System: ") . "$CodingSystem\n";
    print _("CONFDIR: ") . "$CONFDIR\n";
    print _("LIBDIR: ") . "$LIBDIR\n";
    print _("FILTERDIR: ") . "$FILTERDIR\n";
    print _("TEMPLATEDIR: ") . "$TEMPLATEDIR\n";

    my @all_types = keys %var::Supported;
    my @supported = sort grep { $var::Supported{$_} eq "yes" } @all_types;

    my $num_supported = @supported;
    my $num_unsupported = @all_types - @supported;
    print _("Supported media types: ") . "($num_supported)\n";
    print _("Unsupported media types: ") . "($num_unsupported) " . _("marked with minus (-) probably missing application in your \$path.\n");
    for my $mtype (sort keys %var::Supported) {
        my $yn = $var::Supported{$mtype};
        if ($yn eq 'yes') { $yn = ' ' } else {$yn = '-'};
        print "$yn $mtype";
        if ($var::REQUIRE_ACTIONS{$mtype}){
            print ": $var::REQUIRE_ACTIONS{$mtype}.pl";
        }
        print "\n";
    }
}

# Read the list of targets (one per line) from $file.
sub load_target_list ($) {
    my ($file) = @_;

    my $fh_targets = util::efopen($file);
    my @targets = <$fh_targets>;
    util::fclose($fh_targets);
    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
        foreach my $tmp (@targets){
            $tmp =~ s/\r//g;
            util::win32_yen_to_slash(\$tmp);
        }
    }
    chomp @targets;
    return @targets;
}

# convert a relative path into an absolute path
sub absolute_path($$) {
    my ($cwd, $path) = @_;

    $path =~ s!^\.$!\./!;
    $path =~ s!^\.[/\\]!$cwd/!;
    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
        util::win32_yen_to_slash(\$path);
        if ($path =~ m!^//!) {
            # Starts with "//": left unchanged.
        } elsif ($path =~ m!^/[^/]!) {
            # Rooted path without drive: borrow the drive of $cwd.
            my $driveletter = $cwd;
            if ($driveletter =~ m!^([A-Z]:)!i){
                $driveletter = $1;
            }
            $path = "$driveletter$path";
        } elsif ($path !~ m!^[A-Z]:/!i) {
            $path = "$cwd/$path";
        }
    } else {
        $path =~ s!^([^/])!$cwd/$1!;
    }
    return $path;
}

# Expand the given targets (files and directories) into the sorted,
# duplicate-free list of files to index.  Filtering of individual files
# is delegated to add_target().
sub find_target (@) {
    my @targets = @_;

    my $cwd = cwd();
    @targets = map { absolute_path($cwd, $_) } @targets;

    # Convert \ to / with consideration for Shift_JIS encoding.
    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
        foreach my $tmp (@targets){
            util::win32_yen_to_slash(\$tmp);
        }
    }

    # For reporting effects of --allow, --deny, --exclude, --mtime
    # options in --verbose mode.
    my %counts = ();
    $counts{'possible'} = 0;
    $counts{'excluded'} = 0;
    $counts{'too_old'} = 0;
    $counts{'too_new'} = 0;
    $counts{'not_allowed'} = 0;
    $counts{'denied'} = 0;

    # Traverse directories.
    # This routine is not efficient but I prefer reliable logic.
    my @flist = ();
    my $start = time();
    util::vprint(_("find_target starting: "). localtime($start). "\n");
    while (@targets) {
        my $target = shift @targets;

        if ($target eq '') {
            print STDERR "Warning: target contains empty line, skip it\n";
            next;
        }

        if (-f $target) {  # target is a file.
            add_target($target, \@flist, \%counts);
        } elsif (-d $target) {  # target is a directory.
            my @subtargets = ();
            # Find subdirectories in target directory
            # because File::Find::find() does not follow symlink.
            if (-l $target) {
                my $dh = DirHandle->new($target);
                unless (defined $dh) {
                    # The directory could not be opened; previously this
                    # ended in a method call on an undefined value.
                    print STDERR "Warning: cannot open directory $target: $!, skip it\n";
                    next;
                }
                while (defined(my $ent = $dh->read)) {
                    next if ($ent =~ /^\.{1,2}$/);
                    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
                        next if ($ent =~ m!^($conf::DENY_DDN)$!i);
                        my $tmp = $ent;
                        util::win32_yen_to_slash(\$tmp);
                        next if ($ent ne $tmp);
                    }
                    my $fname = "$target/$ent";
                    if (-d $fname) {
                        push(@subtargets, $fname);
                    } else {
                        add_target($fname, \@flist, \%counts);
                    }
                }
            } else {
                @subtargets = ($target);
            }

            #
            # Wanted routine for File::Find's find().
            #
            my $wanted_closure = sub {
                my $fname = "$File::Find::dir/$_";
                add_target($fname, \@flist, \%counts);
            };

            find($wanted_closure, @subtargets) if (@subtargets > 0);
        } else {
            # The message used to be printed without a line terminator.
            print STDERR _("unsupported target: ") . $target . "\n";
        }
    }

    # uniq @flist
    my %mark = ();
    @flist = grep {$mark{$_}++; $mark{$_} == 1} @flist;

    # Sort file names with consideration for numbers.
    # (Every run of digits is zero-padded to 8 places for the comparison.)
    @flist = map  { $_->[0] }
             sort { $a->[1] cmp $b->[1] }
             map  { my $tmp = $_; $tmp =~ s/(\d+)/sprintf("%08d", $1)/ge;
                    [ $_, $tmp ] } @flist;

    my $elapsed = time() - $start ;
    $elapsed += 1 ;  # to round up and avoid 0

    # For --verbose option.
    report_find_target($elapsed, $#flist + 1, %counts);

    return @flist;
}

sub add_target ($\@\%) {
    my ($target, $flists_ref, $counts_ref) = @_;

    if ($target =~ /[\n\r\t]/) {
        $target =~ s/[\n\r\t]//g;
        print STDERR "Warning: $target contains LF/CR/TAB chars, skip it\n";
        return;  # skip a file name containing LF/CR/TAB chars.
    }

    return unless -f $target;  # Only file is targeted.

    $counts_ref->{'possible'}++;

    unless (util::canopen($target)) {
        util::vprint(sprintf(_("Unreadable: %s"), $target));
        $counts_ref->{'excluded'}++;
        return;
    }

    if (defined $conf::EXCLUDE_PATH &&
        $target =~ /$conf::EXCLUDE_PATH/ )
    {
        util::vprint(sprintf(_("Excluded: %s"), $target));
        $counts_ref->{'excluded'}++;
        return;
    }

    #
    # Do processing just like find's --mtime option.
    #
    if (defined $var::Opt{'mtime'}) {
        my $mtime = -M $_;
        if ($var::Opt{'mtime'} < 0) {

            # This must be `>=' not `>' for consistency with find(1).
if (int($mtime) >= - $var::Opt{'mtime'}) { util::vprint(sprintf(_("Too old: %s"), $target)); $counts_ref->{'too_old'}++; return; } } elsif ($var::Opt{'mtime'} > 0) { if ($var::Opt{'mtime'} =~ /^\+/) { if ((int($mtime) < $var::Opt{'mtime'})) { util::vprint(sprintf(_("Too new: %s"), $target)); $counts_ref->{'too_new'}++; return; } } else { if (int($mtime) != $var::Opt{'mtime'}) { if (int($mtime) > $var::Opt{'mtime'}) { util::vprint(sprintf(_("Too old: %s"),$target)); $counts_ref->{'too_old'}++; } else { util::vprint(sprintf(_("Too new: %s"),$target)); $counts_ref->{'too_new'}++; } return; } } } else { # $var::Opt{'mtime'} == 0 ; return; } } # Extract the file name of the target. $target =~ m!^.*/([^/]+)$!; my $fname = $1; if ($fname =~ m!^($conf::DENY_FILE)$!i ) { util::vprint(sprintf(_("Denied: %s"), $target)); $counts_ref->{'denied'}++; return; } if ($fname !~ m!^($conf::ALLOW_FILE)$!i) { util::vprint(sprintf(_("Not allowed: %s"), $target)); $counts_ref->{'not_allowed'}++; return; } else{ util::vprint(sprintf(_("Targeted: %s"), $target)); push @$flists_ref, $target; } } sub report_find_target ($$%) { my ($elapsed, $num_targeted, %counts) = @_; util::vprint(_("find_target finished: ") . localtime(time()). 
"\n"); util::vprint(sprintf(_("Target Files: %d (Scan Performance: Elapsed Sec.: %d, Files/sec: %.1f)"), $num_targeted, $elapsed, $num_targeted /$elapsed)); util::vprint(sprintf(_(" Possible: %d, Not allowed: %d, Denied: %d, Excluded: %d"), $counts{'possible'}, $counts{'not_allowed'}, $counts{'denied'}, $counts{'excluded'})); util::vprint(sprintf(_(" MTIME too old: %d, MTIME too new: %d"), $counts{'too_old'}, $counts{'too_new'})); } sub show_usage () { util::dprint(_("lang_msg: ")."$util::LANG_MSG\n"); util::dprint(_("lang: ")."$util::LANG\n"); my $usage = $usage::USAGE; $usage = _($usage); printf "$usage", $var::VERSION, $var::TRAC_URI, $var::MAILING_ADDRESS; } sub show_mini_usage () { print _("Usage: mknmz [options] ...\n"); print _("Try `mknmz --help' for more information.\n"); } sub show_version () { print $usage::VERSION_INFO; } # # check the file. No $msg is good. # sub check_file ($$$$$) { my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_; my $msg = undef; if ($mtype =~ /; x-system=unsupported$/) { $mtype =~ s/; x-system=unsupported$//; $msg = _("Unsupported media type ")."($mtype)"._(" skipped."); } elsif ($mtype =~ /; x-error=file_size_max/) { $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ; } elsif ($mtype =~ /; x-error=.*$/) { $mtype =~ s/^.*; x-error=(.*)$/$1/; $msg = $mtype; } elsif ($mtype =~ /^x-system/) { $msg = _("system error occurred! ")."($mtype)"._(" skipped."); } elsif (! -e $cfile) { $msg = _("does NOT EXIST! skipped."); } elsif (! util::canopen($cfile)) { $msg = _("is NOT READABLE! skipped."); } elsif ($text_size == 0 || $cfile_size == 0) { $msg = _("is 0 size! skipped."); } elsif ($mtype =~ /^application\/octet-stream/) { $msg = _("may be a BINARY file! skipped."); } elsif ($cfile_size > $conf::FILE_SIZE_MAX) { $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. 
$cfile_size ; } elsif ($text_size > $conf::TEXT_SIZE_MAX) { $msg = _("is larger than your setup after filtered, skipped: ") . 'conf::TEXT_SIZE_MAX (' . $conf::TEXT_SIZE_MAX . ') < '. $text_size ; } return $msg; } # # Write NMZ.version file. # sub write_version() { unless (-f $var::NMZ{'version'}) { my $fh = util::efopen(">$var::NMZ{'version'}"); print $fh "Namazu-Index-Version: $NAMAZU_INDEX_VERSION\n"; util::fclose($fh); } } # # rename each temporary file to a real file name. # sub write_nmz_files () { util::Rename($var::NMZ{'_i'}, $var::NMZ{'i'}); util::Rename($var::NMZ{'_ii'}, $var::NMZ{'ii'}); util::Rename($var::NMZ{'_w'}, $var::NMZ{'w'}); util::Rename($var::NMZ{'_wi'}, $var::NMZ{'wi'}); util::Rename($var::NMZ{'_p'}, $var::NMZ{'p'}); util::Rename($var::NMZ{'_pi'}, $var::NMZ{'pi'}); } # output NMZ.body sub write_body_msg () { for my $file (glob "$TEMPLATEDIR/NMZ.body*") { if ($file =~ m!^.*/NMZ\.body(\.[-\w\.]+)?$!){ my $suffix = $1 ? $1 : ''; write_message("$var::NMZ{'body'}${suffix}"); } } } # output NMZ.tips sub write_tips_msg () { for my $file (glob "$TEMPLATEDIR/NMZ.tips*") { if ($file =~ m!^.*/NMZ\.tips(\.[-\w\.]+)?$!){ my $suffix = $1 ? $1 : ''; write_message("$var::NMZ{'tips'}${suffix}"); } } } # output NMZ.result.* sub write_result_file () { my $fname = "NMZ.result.normal"; my @files = glob "$TEMPLATEDIR/NMZ.result.*"; for my $file (@files) { $file =~ m!(NMZ\.result\.[^/]*)$!; my $target = "$var::OUTPUT_DIR/$1"; if (-f $target) { # already exist; next; } else { my $buf = util::readfile($file); my $fh_file = util::efopen(">$target"); print $fh_file $buf; util::fclose($fh_file); } } } # write NMZ.body and etc. sub write_message ($) { my ($msgfile) = @_; if (! 
-f $msgfile) { my ($template, $fname); $msgfile =~ m!.*/(.*)$!; $fname = $1; $template = "$TEMPLATEDIR/$fname"; if (-f $template) { my $buf = util::readfile($template); my $fh_output = util::efopen(">$msgfile"); print $fh_output $buf; util::fclose($fh_output); } } } # # Make the NMZ.slog file for logging. # sub make_slog_file () { if (! -f $var::NMZ{'slog'}) { my $fh = util::efopen(">$var::NMZ{'slog'}"); util::fclose($fh); undef $fh; chmod 0666, $var::NMZ{'slog'}; } { my $fh_slogfile = util::efopen(">>$var::NMZ{'slog'}"); util::fclose($fh_slogfile); } } # # Concatenate $CURRENTDIR to the head of each file. # sub change_filenames ($) { my $dir = $var::OUTPUT_DIR; for my $key (sort keys %var::NMZ) { next if $key =~ /^_/; # exclude temporary file $var::NMZ{$key} = "$dir/$var::NMZ{$key}"; } # temporary files for my $key (sort keys %var::NMZ) { if ($key =~ /^_/) { $var::NMZ{$key} = util::tmpnam($var::NMZ{$key}); } } if ($var::Opt{'debug'}) { for my $key (sort keys %var::NMZ) { util::dprint("NMZ: $var::NMZ{$key}\n"); } } } # # Preparation processing for appending index files. # sub append_index (@) { my @flist = @_; my $docid_base = 0; ($docid_base, @flist) = set_target_files(@flist); unless (@flist) { # if @flist is empty if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) { set_lockfile($var::NMZ{'lock2'}); update_dateindex(); update_registry(0); make_headfoot_pages(0, get_total_keys()); put_log(0, 0, 0, get_total_keys()); make_headfoot_pages(get_status("files"), get_status("keys")); util::remove_tmpfiles(); } print _("No files to index.\n"); exit 0; } $APPENDMODE = 1; # conserve files by copying copy($var::NMZ{'i'}, $var::NMZ{'_i'}); copy($var::NMZ{'w'}, $var::NMZ{'_w'}); copy($var::NMZ{'t'}, $var::NMZ{'_t'}) unless -f $var::NMZ{'_t'}; # preupdated ? copy($var::NMZ{'p'}, $var::NMZ{'_p'}); copy($var::NMZ{'pi'}, $var::NMZ{'_pi'}); return ($docid_base, @flist); } # # Set target files to @flist and return with the number of regiested files. 
#
# set_target_files(@found_files)
#
# Compare the files found on disk with the documents already registered in
# the index and decide what has to be (re)indexed.
#
# Returns a list: the number of entries in the registry (used by the caller
# as the document-id base) followed by the files to index.
#
# Side effects: may set $DeletedFilesCount / $UpdatedFilesCount and write
# the temporary registry and date index through preupdate_registry() and
# preupdate_dateindex().
#
# NOTE: the empty prototype is kept as-is for interface compatibility; the
# sub is called with arguments from code compiled earlier in the file, so
# do not move this definition above its callers.
#
sub set_target_files() {
    my %rdocs;    # 'rdocs' means 'registered documents'
    my @found_files = @_;

    # Load the list of registered documents.
    $rdocs{'name'} = load_registry();

    # Count every live registry entry (entries starting with '# ' are
    # documents that load_registry() flagged as deleted).
    my %mark1;
    for my $name (@{$rdocs{'name'}}) {
        $mark1{$name}++ unless $name =~ /^\# /;
    }

    # Pick up overlapped documents (found on disk AND registered).
    my @overlapped_files;
    $rdocs{'overlapped'} = {};    # Prepare an anonymous hash.
    for my $overlapped (grep { $mark1{$_} } @found_files) {
        $rdocs{'overlapped'}{$overlapped} = 1;
        push @overlapped_files, $overlapped;
    }

    # Pick up not overlapped documents which are files to index.
    my @flist = grep { ! $mark1{$_} } @found_files;

    if ($var::Opt{'noupdate'}) {
        return (scalar @{$rdocs{'name'}}, @flist);
    }

    # Load the date index.
    $rdocs{'mtime'} = load_dateindex();

    if (@{$rdocs{'mtime'}} == 0) {
        return (scalar @{$rdocs{'name'}}, @flist);
    }

    util::assert(@{$rdocs{'name'}} == @{$rdocs{'mtime'}},
                 "NMZ.r ($#{$rdocs{'name'}}) and NMZ.t ($#{$rdocs{'mtime'}}) are not consistent!");

    # Pick up deleted documents and do marking
    # (registered in the NMZ.r but not existent in the filesystem).
    my @deleted_documents;
    unless ($var::Opt{'nodelete'}) {
        my %mark2;
        $mark2{$_}++ for @found_files;
        for my $deleted (grep { $_ !~ /^\# / && ! $mark2{$_} && ! $rdocs{'overlapped'}{$_} }
                         @{$rdocs{'name'}})
        {
            $rdocs{'deleted'}{$deleted} = 1;
            push @deleted_documents, $deleted;
        }
    }

    # Check file size as well when requested.
    if ($var::Opt{'checkfilesize'}) {
        $rdocs{'size'} = load_sizefield();
    }

    # Pick up updated documents and set the missing number for deleted files.
    my @updated_documents = pickup_updated_documents(\%rdocs);

    # Append updated files to the list of files to index.
    if (@updated_documents) {
        push @flist, @updated_documents;
    }

    # Remove duplicates.
    my %seen = ();
    @flist = grep { ! $seen{$_}++ } @flist;

    util::dprint(_("\n\n== found files ==\n"), join("\n", @found_files), "\n");
    util::dprint(_("\n\n== registered documents ==\n"), join("\n", @{$rdocs{'name'}}), "\n");
    util::dprint(_("\n\n== overlapped documents ==\n"), join("\n", @overlapped_files), "\n");
    util::dprint(_("\n\n== deleted documents ==\n"), join("\n", @deleted_documents), "\n");
    util::dprint(_("\n\n== updated documents ==\n"), join("\n", @updated_documents), "\n");
    util::dprint(_("\n\n== files to index ==\n"), join("\n", @flist), "\n");

    # Update NMZ.t with the missing number information and
    # append updated files and deleted files to NMZ.r with leading '# '.
    if (@updated_documents || @deleted_documents) {
        $DeletedFilesCount = 0;
        $UpdatedFilesCount = 0;
        $UpdatedFilesCount += @updated_documents;
#       $DeletedFilesCount += @updated_documents;
        $DeletedFilesCount += @deleted_documents;
        preupdate_dateindex(@{$rdocs{'mtime'}});
        preupdate_registry(@updated_documents, @deleted_documents);
    }

    # Return the number of registered documents and list of files to index.
    return (scalar @{$rdocs{'name'}}, @flist);
}

#
# preupdate_registry(@names)
#
# Write the given document names to the temporary registry file
# ($var::NMZ{'_r'}), each prefixed with '# ', followed by a
# "## deleted:" timestamp line.
#
sub preupdate_registry(@) {
    my (@list) = @_;

    my $fh_registry = util::efopen(">$var::NMZ{'_r'}");

    # Same substitution as before, but applied in an explicit loop instead
    # of abusing grep for its side effect.  @list is a private copy.
    for my $entry (@list) {
        $entry =~ s/(.*)/\# $1\n/;
    }
    print $fh_registry @list;
    print $fh_registry &_("## deleted: ") . util::rfc822time(time()) . "\n\n";
    util::fclose($fh_registry);
}

#
# preupdate_dateindex(@mtimes)
#
# Write the whole list of mtimes to the temporary date index
# ($var::NMZ{'_t'}) packed with the "N*" template.
#
sub preupdate_dateindex(@) {
    my @mtimes = @_;

    # Since rewriting the entire file, it is not efficient,
    # but simple and reliable. this would be revised in the future.
    my $fh_dateindex = util::efopen(">$var::NMZ{'_t'}");
#    print "\nupdate_dateindex\n", join("\n", @mtimes), "\n\n";
    print $fh_dateindex pack("N*", @mtimes);
    util::fclose($fh_dateindex);
}

#
# update_registry($docid_count)
#
# Append the contents of the temporary registry ($var::NMZ{'_r'}) to the
# real one ($var::NMZ{'r'}), add a "## indexed:" timestamp line when
# $docid_count is positive, then remove the temporary file.
#
sub update_registry ($) {
    my ($docid_count) = @_;

    {
        my $fh_registry  = util::efopen(">>$var::NMZ{'r'}");
        my $fh_registry_ = util::efopen($var::NMZ{'_r'});
        while (defined(my $line = <$fh_registry_>)) {
            print $fh_registry $line;
        }
        if ($docid_count > 0) {
            print $fh_registry &_("## indexed: ") . util::rfc822time(time()) . "\n\n";
        }
        util::fclose($fh_registry_) if (defined $fh_registry_);
        util::fclose($fh_registry);
    }
    unlink $var::NMZ{'_r'};
}

#
# Replace the date index with the temporary one.
#
sub update_dateindex () {
    util::Rename($var::NMZ{'_t'}, $var::NMZ{'t'});
}

#
# Append every "<field file>.tmp" to the corresponding field file and
# remove the temporary file.  Dies through util::cdie() when a globbed
# name does not look like a field temporary file.
#
sub update_field_index () {
    my @list = glob "$var::NMZ{'field'}.*.tmp";
    for my $tmp (@list) {
        if ($tmp =~ m!((^.*/NMZ\.field\..+?(?:\.i)?)\.tmp$)!) {
            my $fname_tmp = $1;    # full name including ".tmp"
            my $fname_out = $2;    # same name without ".tmp"
            {
                my $fh_field = util::efopen(">>$fname_out");
                my $fh_tmp   = util::efopen($fname_tmp);

                while (defined(my $line = <$fh_tmp>)) {
                    print $fh_field $line;
                }
                util::fclose($fh_tmp) if (defined $fh_tmp);
                util::fclose($fh_field);
            }
            unlink $fname_tmp;
        } else {
            util::cdie(_("update_field_index: ")."@list");
        }
    }
}

#
# pickup_updated_documents(\%rdocs)
#
# Walk the registered documents in order.  Deleted documents and
# overlapped documents whose mtime (or, with --checkfilesize, size)
# differs from the recorded value get their slot in $rdocs{'mtime'} set
# to -1 (the "missing number").  Returns the list of updated files.
#
# Callers in this file pass an explicit hash reference; this works because
# they are compiled before this prototype is seen, so keep this sub below
# its callers.
#
sub pickup_updated_documents (\%) {
    my ($rdocs_ref) = @_;
    my @updated_documents = ();

    # To avoid duplicated outputs caused by --html-split support.
    my %printed = ();
    my $i = 0;
    for my $cfile (@{$rdocs_ref->{'name'}}) {
        if (defined($rdocs_ref->{'deleted'}{$cfile})) {
            unless ($printed{$cfile}) {
                print "$cfile " . _("was deleted!\n");
                $printed{$cfile} = 1;
            }
            $rdocs_ref->{'mtime'}[$i] = -1;    # Assign the missing number.
        } elsif (defined($rdocs_ref->{'overlapped'}{$cfile})) {
            # One stat() call yields both size (field 7) and mtime (field 9).
            my ($cfile_size, $cfile_mtime) = (stat($cfile))[7, 9];
            my $rfile_mtime = $rdocs_ref->{'mtime'}[$i];
            my $check_size  = $var::Opt{'checkfilesize'};
            my $rfile_size;
            $rfile_size = $rdocs_ref->{'size'}[$i] if $check_size;

            if ($rfile_mtime != $cfile_mtime
                || ($check_size && ($cfile_size != $rfile_size)))
            {
                # The file is updated!
                unless ($printed{$cfile}) {
                    print "$cfile " . _("was updated!\n");
                    $printed{$cfile} = 1;
                }
                push(@updated_documents, $cfile);
                $rdocs_ref->{'mtime'}[$i] = -1;    # Assign the missing number.
            }
        }
        $i++;
    }
    return @updated_documents
}

#
# Read the whole date index ($var::NMZ{'t'}) and return a reference to
# the list of values unpacked with the "N*" template.
#
sub load_dateindex() {
    my $fh_dateindex = util::efopen($var::NMZ{'t'});

    my $size = -s $var::NMZ{'t'};
    my $buf = "";
    read($fh_dateindex, $buf, $size);
    my @list = unpack("N*", $buf);    # load date index
#    print "\nload_dateindex\n", join("\n", @list), "\n\n";

    util::fclose($fh_dateindex);
    return [ @list ];
}

#
# Read the registry ($var::NMZ{'r'}) and return a reference to the list
# of registered document names, in file order.  A document that also
# appears in a '# name' (deleted) line is returned as "# name", once per
# such line.
#
sub load_registry () {
    my $fh_registry = util::efopen($var::NMZ{'r'});

    my @list = ();
    my %deleted = ();
    my @registered = ();

    while (defined(my $line = <$fh_registry>)) {
        chomp($line);
        next if $line =~ /^\s*$/;    # an empty line
        next if $line =~ /^##/;      # a comment
        if ($line =~ s/^\#\s+//) {   # deleted document
            $deleted{$line}++;
        } else {
            # Remove HTML's anchor generated by --html-split option.
            $line =~ s/\t.*$//g;
            push @registered, $line;
        }
    }

    util::fclose($fh_registry) if (defined $fh_registry);

    # Exclude deleted documents.
    for my $doc (@registered) {
        if ($deleted{$doc}) {
            push @list, "# $doc";
            $deleted{$doc}--;
        } else {
            push @list, $doc;
        }
    }

    return [ @list ];
}

#
# Get file size information from NMZ.field.size.
# Returns a reference to the list of lines (chomped), one per document.
#
sub load_sizefield() {
    my $fh_sizefield = util::efopen($var::NMZ{'field'} . '.size');
    return [] unless defined $fh_sizefield;
    my $line;
    my @ret = ();
    while (defined($line = <$fh_sizefield>)) {
        chomp $line;
        push @ret, $line;
    }
    util::fclose($fh_sizefield) if (defined $fh_sizefield);
    return \@ret;
}

#
# Return the "keys" value stored in the status file with commas removed,
# or 0 when it is not recorded.
#
sub get_total_keys() {
    my $keys = get_status("keys");
    $keys =~ s/,//g if (defined $keys);
    $keys = 0 unless defined $keys;
    return $keys;
}

#
# Return the "files" value stored in the status file with commas removed,
# or 0 when it is not recorded.
#
sub get_total_files() {
    my $files = get_status("files");
    $files =~ s/,//g if (defined $files);
    $files = 0 unless defined $files;
    return $files;
}

#
# get_status($key)
#
# Look up "$key <value>" in the status file ($var::NMZ{'status'}) and
# return the value of the first matching line, or undef when the file
# cannot be opened or has no such key.
#
# The explicit "return undef" is deliberate: the result is also used
# inside argument lists, where a bare "return" would silently drop the
# argument.
#
sub get_status($) {
    my ($key) = @_;

    my $fh = util::fopen($var::NMZ{'status'});
    return undef unless defined $fh;

    while (defined(my $line = <$fh>)) {
        # \Q...\E: the key is a literal word, never a pattern.
        if ($line =~ /^\Q$key\E\s+(.*)$/) {
            my $value = $1;
            util::dprint("status: $key = $value\n");
            util::fclose($fh);
            return $value;
        }
    }
    util::fclose($fh) if (defined $fh);
    return undef;
}

#
# Store the total number of files (commas stripped) under "files".
#
sub put_total_files($) {
    my ($number) = @_;
    $number =~ tr/,//d;

    put_status("files", $number);
}

#
# Store the total number of keys (commas stripped) under "keys".
#
sub put_total_keys($) {
    my ($number) = @_;
    $number =~ tr/,//d;

    put_status("keys", $number);
}

#
# put_status($key, $value)
#
# Append "$key $value" to the status file.  When the key already has a
# value the whole status file is removed first, so every other key in it
# is discarded as well.
#
sub put_status($$) {
    my ($key, $value) = @_;

    # remove NMZ.status file if the file has a previous value.
    unlink $var::NMZ{'status'} if defined get_status($key);

    my $fh = util::efopen(">> $var::NMZ{'status'}");
    print $fh "$key $value\n";
    util::fclose($fh);
}

#
# put_log($total_files_size, $start_time, $docid_count, $total_keys_count)
#
# Print a summary of the indexing run to STDOUT (translated with _()),
# append the untranslated summary to the log file, and store the new
# totals plus the command line and working directory in the status file.
# A $start_time of 0 means "no timing information".
#
sub put_log ($$$$) {
    my ($total_files_size, $start_time, $docid_count, $total_keys_count) = @_;

    my $date = localtime;
    my $added_files_count       = $docid_count;
    my $deleted_documents_count = $DeletedFilesCount;
    my $updated_documents_count = $UpdatedFilesCount;
    my $total_files_count = get_total_files() + $docid_count
        - $DeletedFilesCount - $UpdatedFilesCount;
    my $added_keys_count = $total_keys_count - get_total_keys();

    my $processtime = time - $start_time;
    $processtime = 0 if $start_time == 0;

    # Fields with a false value (0, empty) are omitted from the log.
    my @logmsgs = ();
    if ($APPENDMODE) {
        push @logmsgs, N_("[Append]");
    } else {
        push @logmsgs, N_("[Base]");
    }
    push @logmsgs, N_("Date:"), "$date" if $date;
    push @logmsgs, N_("Added Documents:"),
        util::commas("$added_files_count") if $added_files_count;
    push @logmsgs, N_("Deleted Documents:"),
        util::commas("$deleted_documents_count") if $deleted_documents_count;
    push @logmsgs, N_("Updated Documents:"),
        util::commas("$updated_documents_count") if $updated_documents_count;
    push @logmsgs, N_("Size (bytes):"),
        util::commas("$total_files_size") if $total_files_size;
    push @logmsgs, N_("Total Documents:"),
        util::commas("$total_files_count") if $total_files_count;
    push @logmsgs, N_("Added Keywords:"),
        util::commas("$added_keys_count") if $added_keys_count;
    push @logmsgs, N_("Total Keywords:"),
        util::commas("$total_keys_count") if $total_keys_count;
    push @logmsgs, N_("Wakati:"), "$conf::WAKATI" if $conf::WAKATI;
    push @logmsgs, N_("Time (sec):"),
        util::commas("$processtime") if $processtime;
    push @logmsgs, N_("File/Sec:"), sprintf "%.2f",
        (($added_files_count + $updated_documents_count) / $processtime)
        if $processtime;
    push @logmsgs, N_("System:"), "$English::OSNAME" if $English::OSNAME;
    push @logmsgs, N_("Perl:"), sprintf("%f", $English::PERL_VERSION);
    push @logmsgs, N_("Namazu:"), "$var::VERSION" if $var::VERSION;

    my $log_for_file = "";

    my $msg = shift @logmsgs;    # [Base] or [Append]
    # To stdout, use gettext.
    print _($msg), "\n";
    # To log file, do not use gettext.
    $log_for_file = $msg . "\n";
    while (@logmsgs) {
        my $field = shift @logmsgs;
        my $value = shift @logmsgs;
        printf "%-20s %s\n", _($field), "$value";
        $log_for_file .= sprintf "%-20s %s\n", $field, "$value";
    }
    print "\n";
    $log_for_file .= "\n";

    put_log_to_logfile($log_for_file);
    # "files" must be stored first: put_status() removes the whole status
    # file when the key it is given already exists.
    put_total_files($total_files_count);
    put_total_keys($total_keys_count);

    my $argv = join "\t", @ARGV;
    my $cwd = cwd();
    put_status("argv", $argv);
    put_status("cwd", $cwd);
}

#
# Append $logmsg to the log file ($var::NMZ{'log'}).
#
sub put_log_to_logfile ($) {
    my ($logmsg) = @_;
    my $fh_logfile = util::efopen(">>$var::NMZ{'log'}");
    print $fh_logfile $logmsg;
    util::fclose($fh_logfile);
}

#
# Return the current year (four digits, local time).
#
sub get_year() {
    my $year = (localtime)[5] + 1900;

    return $year;
}

# Compose NMZ.head and NMZ.foot. Prepare samples if necessary.
# Insert $docid_count, $key_count, and $month/$day/$year respectively.
#
# make_headfoot($file, $docid_count, $key_count)
#
# Rewrite $file in place after substituting counters, date, version and
# address information into its contents.  When $file does not exist yet,
# the template of the same base name under $TEMPLATEDIR is used as the
# starting text; when neither exists the sub returns without writing.
#
# NOTE(review): every pattern below shows empty capture groups "()" and
# one substitution has an empty pattern ("s##...#").  This looks like
# marker text that was lost from this copy of the file -- confirm against
# the upstream source before relying on or editing these substitutions.
#
sub make_headfoot ($$$) {
    my ($file, $docid_count, $key_count) = @_;

    # Zero-padded day and month, four-digit year (local time).
    my $day   = sprintf("%02d", (localtime)[3]);
    my $month = sprintf("%02d", (localtime)[4] + 1);
    my $year  = get_year();
    my $buf   = "";

    if (-f $file) {
	$buf = util::readfile($file);
    } else {
	# Fall back to the template that has the same file name.
	$file =~ m!.*/(.*)$!;
	my $fname = $1;
	my $template = "$TEMPLATEDIR/$fname";

	if (-f $template) {
	    $buf = util::readfile($template);
	} else {
	    return;
	}
    }

    my $fh_file = util::efopen(">$file");

    # Total number of documents after this run: previous total plus the
    # newly indexed ones, minus deleted and updated documents.
    if ($buf =~ /()\s*(.*)\s*()/) {
	my $total_files_count = util::commas(get_total_files() + $docid_count
			      - $DeletedFilesCount - $UpdatedFilesCount);
	$buf =~ s/()(.*)()/$1 $total_files_count $3/;
    }
    # Total number of keys.  The captured old value is discarded: $tmp is
    # overwritten with $key_count right after the commas are stripped.
    if ($buf =~ /()\s*(.*)\s*()/) {
	my $tmp = $2;
	$tmp =~ tr/,//d;
	$tmp = $key_count;
	$tmp = util::commas($tmp);
	$buf =~ s/()(.*)()/$1 $tmp $3/;
    }
    # Name of the index directory, date (YYYY-MM-DD), version and address.
    my $index_dir = basename($var::OUTPUT_DIR);
    $buf =~ s##$index_dir#gs;
    $buf =~ s#()(.*)()#$1 $year-$month-$day $3#gs;
    $buf =~ s/()(.*)()/$1 v$var::VERSION $3/gs;
    $buf =~ s{()(.*)()}
	{$1\n$conf::ADDRESS\n$3}gs;
    $buf =~ s{()(.*)()}
	{$1\n\n$3}gs;

    print $fh_file $buf;
    util::fclose($fh_file);
}

# Make phrase hashes for NMZ.p
# Handle two words each for calculating a hash value ranged 0-65535.
#
# make_phrase_hash($docid_count, $docid_base, $contref)
#
# $contref is a reference to the document text; the text is modified in
# place.  For every pair of adjacent words the hash of their concatenation
# is computed with hash(), and the document id ($docid_count + $docid_base)
# is appended to %PhraseHash{hash} as a BER-compressed ("w") difference
# from the last id stored for that hash (%PhraseHashLast).  Each hash is
# recorded at most once per document.
sub make_phrase_hash ($$$) {
    my ($docid_count, $docid_base, $contref) = @_;

    my %tmp = ();    # hashes already recorded for this document
    $$contref =~ s!\x7f */? *\d+ *\x7f!!g;  # remove tags of weight
    $$contref =~ tr/\xa1-\xfea-z0-9 \n//cd; # remove all symbols

    my @words = split(/\s+/, $$contref);
    @words = grep {$_ ne ""} @words;   # remove empty words

    my $word_b = shift @words;         # previous word of the pair
    my $docid = $docid_count + $docid_base;
    for my $word (@words) {
	# An over-long word is skipped without becoming the "previous" word.
	next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
	my $hash = hash($word_b . $word);
	unless (defined $tmp{$hash}) {
	    $tmp{$hash} = 1;
	    $PhraseHashLast{$hash} = 0 unless defined $PhraseHashLast{$hash};
	    $PhraseHash{$hash} .= pack("w", $docid - $PhraseHashLast{$hash});
#	    util::dprint("<$word_b, $word> $hash\n");
	    $PhraseHashLast{$hash} = $docid;
	}
	$word_b = $word;
    }
}

# Construct NMZ.p and NMZ.pi file.
# This processing is rather complex.
#
# write_phrase_hash()
#
# Merge the in-memory phrase hash into new temporary files through
# write_phrase_hash_sub(), then rename those files over the current
# temporary phrase files ('__p' -> '_p', '__pi' -> '_pi').
#
sub write_phrase_hash () {
    write_phrase_hash_sub();
    util::Rename($var::NMZ{'__p'}, $var::NMZ{'_p'});
    util::Rename($var::NMZ{'__pi'}, $var::NMZ{'_pi'});
}

#
# write_phrase_hash_sub()
#
# For each of the 65536 possible hash values write one pointer to the new
# phrase index ('__pi') and, when there is data, one record to the new
# phrase file ('__p').  A record is the existing record (if a previous
# phrase file could be opened) followed by the new entries from
# %PhraseHash, prefixed with its BER-compressed length.  Hash values with
# no data get the pointer 0xffffffff.  %PhraseHash and %PhraseHashLast are
# emptied at the end.
#
sub write_phrase_hash_sub () {
    my $opened = 0;

    # NOTE(review): the comparison forces %PhraseHash into scalar context,
    # which does not appear to yield '' even for an empty hash, so this
    # guard looks like it never fires.  Confirm before changing it:
    # write_phrase_hash() renames the temporary files unconditionally.
    return 0 if %PhraseHash eq ''; # namazu-devel-ja #3146
    util::dprint(_("doing write_phrase_hash() processing.\n"));

    my $fh_tmp_pi = util::efopen(">$var::NMZ{'__pi'}");
    my $fh_tmp_p  = util::efopen(">$var::NMZ{'__p'}");

    # The previous phrase file is optional; its index is required only
    # when the phrase file itself could be opened.
    my $fh_phrase = util::fopen($var::NMZ{'_p'});
    my $fh_phraseindex = undef;
    if ($fh_phrase) {
        $fh_phraseindex = util::efopen($var::NMZ{'_pi'});
        $opened = 1;
    }

    my $ptr = 0;    # offset of the next record in the new phrase file
    for (my $i = 0; $i < 65536; $i++) {

        my $baserecord = "";
        my $baseleng = 0;

        if ($opened) {
            my $tmp = 0;
            read($fh_phraseindex, $tmp, $var::INTSIZE);
            $tmp = unpack("N", $tmp);
            if ($tmp != 0xffffffff) {    # 0xffffffff
                $baseleng = readw($fh_phrase);
                read($fh_phrase, $baserecord, $baseleng);
            }
        }
        if (defined($PhraseHash{$i})) {
            if ($baserecord eq "") {
                print $fh_tmp_pi pack("N", $ptr);
                my $record = $PhraseHash{$i};
                my $n2 = length($record);
                my $data = pack("w", $n2) . $record;
                print $fh_tmp_p $data;
                $ptr += length($data);
            } else {
                print $fh_tmp_pi pack("N", $ptr);
                my $record = $PhraseHash{$i};
                # The first new entry is stored as a difference from the
                # last document id of the existing record.
                my $last_docid = get_last_docid($baserecord, 1);
                my $adjrecord = adjust_first_docid($record, $last_docid);
                check_records(\$record, \$baserecord, 1) unless defined $adjrecord; # namazu-bugs-ja#31
                $record = $adjrecord;
                my $n2 = length($record) + $baseleng;
                my $data = pack("w", $n2) . $baserecord . $record;
                print $fh_tmp_p $data;
                $ptr += length($data);
            }
        } else {
            if ($baserecord eq "") {
                # if $baserecord has no data, set to 0xffffffff
                print $fh_tmp_pi pack("N", 0xffffffff);
            } else {
                print $fh_tmp_pi pack("N", $ptr);
                my $data = pack("w", $baseleng) . $baserecord;
                print $fh_tmp_p $data;
                $ptr += length($data);
            }
        }
    }

    if ($opened) {
        util::fclose($fh_phraseindex);
    }
    if (defined $fh_phrase) {
        util::fclose($fh_phrase);
    }
    util::fclose($fh_tmp_p);
    util::fclose($fh_tmp_pi);

    %PhraseHash = ();
    %PhraseHashLast = ();
}

# Dr. Knuth's ``hash'' from (UNIX MAGAZINE May 1998)
#
# hash($word): XOR together one @Seed table entry per character of $word
# (table row chosen by the character position modulo 4, column by the
# character code) and return the low 16 bits.
sub hash ($) {
    my ($word) = @_;

    my $hash = 0;
    for (my $i = 0; $word ne ""; $i++) {
        $hash ^= $Seed[$i & 0x03][ord($word)];
        $word = substr $word, 1;
        # $word =~ s/^.//; is slower
    }
    return $hash & 65535;
}

# Count frequencies of words.
#
# count_words($docid_count, $docid_base, $contref, $kanji)
#
# Normalize the text referenced by $contref (in place), score its words
# with wordcount_sub() and append, for every word, the pair
# (document-id difference, score) to %KeyIndex as two BER-compressed
# integers.  $kanji enables Japanese word segmentation when the language
# is "ja".
sub count_words ($$$$) {
    my ($docid_count, $docid_base, $contref, $kanji) = @_;

    # Normalize into small letter.
    $$contref =~ tr/A-Z/a-z/;

    # Remove control char.
    $$contref =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a/ /;

    # It corresponds to -j option of ChaSen.
    $$contref =~ s/^[ \t\f]+//gm;    # except "\r\n"
    $$contref =~ s/[ \t\f]+$//gm;    # except "\r\n"
    $$contref =~ s/([a-z])-\n([a-z])/$1$2/gsi;    # for hyphenation
    if (util::islang("ja")) {
        $$contref =~ s/([\x80-\xff])\n([\x80-\xff])/$1$2/gs;
        # Break the line after the EUC-JP ideographic full stop (A1 A3)
        # and ideographic comma (A1 A2).  The bytes are written as escapes
        # so that the pattern survives any re-encoding of this file.
        $$contref =~ s/(\xa1\xa3|\xa1\xa2)/$1\n/gs;
    }
    $$contref =~ s/\n+/\n/gs;

    # Do wakatigaki if necessary.
    if (util::islang("ja")) {
        wakati::wakatize_japanese($contref) if $kanji;
    }

    # Split at the first weight tag (\x7f): $part1 is unweighted text,
    # $part2 holds the weighted sections.
    my $part1 = "";
    my $part2 = "";
    if ($$contref =~ /\x7f/) {
        $part1 = substr $$contref, 0, index($$contref, "\x7f");
        $part2 = substr $$contref, index($$contref, "\x7f");
#       $part1 = $PREMATCH;  # $& and friends are not efficient
#       $part2 = $MATCH . $POSTMATCH;
    } else {
        $part1 = $$contref;
        $part2 = "";
    }

    # do scoring
    # wordcount_sub() is called with an explicit hash reference; this
    # relies on wordcount_sub being defined further down in the file.
    my %word_count = ();
    $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
        wordcount_sub($2, $1, \%word_count)!ge;
    wordcount_sub($part1, 1, \%word_count);

    # Add them to whole index
    my $docid = $docid_count + $docid_base;
    for my $word (keys(%word_count)) {
        next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
        $KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
        $KeyIndex{$word} .= pack("w2",
                                 $docid - $KeyIndexLast{$word},
                                 $word_count{$word});
        $KeyIndexLast{$word} = $docid;
    }
}

#
# Count words and do score weighting
#
# wordcount_sub($text, $weight, \%word_count)
#
# Add $weight to the count of every word of $text in the referenced hash.
# Always returns "" (it is used as the replacement of an s///e).
#
sub wordcount_sub ($$\%) {
    my ($text, $weight, $word_count) = @_;

    # Remove all symbols when -K option is specified.
    $text =~ tr/\xa1-\xfea-z0-9/   /c if $var::Opt{'nosymbol'};

    # Count frequencies of words in a current document.
    # Handle symbols as follows.
    #
    # tcp/ip      ->  tcp/ip,     tcp,      ip
    # (tcp/ip)    ->  (tcp/ip),   tcp/ip,   tcp, ip
    # ((tcp/ip))  ->  ((tcp/ip)), (tcp/ip), tcp
    #
    # Don't do processing for nested symbols.
    # NOTE: When -K is specified, all symbols are already removed.

    my @words = split /\s+/, $text;
    for my $word (@words) {
        next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
        if ($var::Opt{'noedgesymbol'}) {
            # remove symbols at both ends
            $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
        }
        $word_count->{$word} = 0 unless defined($word_count->{$word});
        $word_count->{$word} += $weight;
        unless ($var::Opt{'nosymbol'}) {
            # Strip one leading and/or trailing symbol and count the rest.
            # If nothing but plain characters remains we are done.
            if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
                $word_count->{$1} = 0 unless defined($word_count->{$1});
                $word_count->{$1} += $weight;
                next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
            } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
                $word_count->{$1} = 0 unless defined($word_count->{$1});
                $word_count->{$1} += $weight;
                next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
            } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
                $word_count->{$1} = 0 unless defined($word_count->{$1});
                $word_count->{$1} += $weight;
                next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
            }

            # Count the symbol-separated parts.  The list is declared
            # unconditionally: "my ... if COND" has undefined behaviour.
            my @words_ = ();
            if ($word =~ /[^\xa1-\xfea-z_0-9]/) {
                @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word);
            }
            for my $tmp (@words_) {
                next if $tmp eq "";
                $word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
                $word_count->{$tmp} += $weight;
            }
        }
    }
    return "";
}

# Construct NMZ.i and NMZ.ii file. this processing is rather complex.
#
# write_index(): merge %KeyIndex into new temporary files through
# write_index_sub(), rename them over the current temporary index files
# and return the number of keys written.
sub write_index () {
    my $key_count = write_index_sub();
    util::Rename($var::NMZ{'__i'}, $var::NMZ{'_i'});
    util::Rename($var::NMZ{'__w'}, $var::NMZ{'_w'});

    return $key_count;
}

# readw: read one pack 'w' word.
# (Contributed code.)
#
# Reads bytes from $fh up to and including the first byte whose high bit
# is clear, and returns the decoded BER-compressed integer.
sub readw ($) {
    my $fh = shift;
    my $ret = '';
    my $c;

    while (read($fh, $c, 1)){
        $ret .= $c;
        last unless 0x80 & ord $c;
    }
    return unpack('w', $ret);
}

#
# get_last_docid($record, $step)
#
# $record is a string of BER-compressed integers in which every $step-th
# value (starting with the first) is a document-id difference.  Returns
# the sum of those differences, i.e. the last document id of the record.
#
sub get_last_docid ($$) {
    my ($record, $step) = @_;
    my (@data) = unpack 'w*', $record;

    my $sum = 0;
    for (my $i = 0; $i < @data; $i += $step) {
        $sum += $data[$i];
    }
    return $sum;
}

#
# adjust_first_docid($record, $last_docid)
#
# Subtract $last_docid from the first value of $record and return the
# re-packed record, or undef when the result would be negative.
#
sub adjust_first_docid ($$) {
    my ($record, $last_docid) = @_;
    my (@data) = unpack 'w*', $record;

    $data[0] = $data[0] - $last_docid;
    return undef if ($data[0] < 0); # namazu-bug-ja#31
    $record = pack 'w*', @data;
    return $record;
}

#
# write_index_sub()
#
# Merge the sorted words of %KeyIndex with the existing word list ('_w')
# and index ('_i'), if any, into new files ('__w', '__i') and write the
# pointer tables ('_wi', '_ii').  Returns the number of keys written, or
# 0 without touching any file when %KeyIndex is empty.  %KeyIndex and
# %KeyIndexLast are emptied at the end.
#
sub write_index_sub () {
    my @words = sort keys(%KeyIndex);
    return 0 if $#words == -1;

    my $cnt = 0;          # index of the next new word to merge
    my $ptr_i = 0;        # offset of the next record in the new index
    my $ptr_w = 0;        # offset of the next word in the new word list
    my $key_count = 0;
    my $baserecord = "";

    util::dprint(_("doing write_index() processing.\n"));
    my $fh_tmp_i = util::efopen(">$var::NMZ{'__i'}");
    my $fh_tmp_w = util::efopen(">$var::NMZ{'__w'}");
    my $fh_i     = util::fopen($var::NMZ{'_i'});
    my $fh_ii    = util::efopen(">$var::NMZ{'_ii'}");
    my $fh_w     = util::fopen($var::NMZ{'_w'});
    my $fh_wi    = util::efopen(">$var::NMZ{'_wi'}");

    # Write one index record and its word, record both offsets in the
    # pointer tables and advance the counters.
    my $emit = sub {
        my ($data_i, $data_w) = @_;
        print $fh_tmp_i $data_i;
        print $fh_tmp_w $data_w;
        print $fh_ii pack("N", $ptr_i);
        print $fh_wi pack("N", $ptr_w);
        $ptr_i += length($data_i);
        $ptr_w += length($data_w);
        $key_count++;
    };

    if ($fh_w) {
      BASE_WORD:
        while (defined(my $line = <$fh_w>)) {
            chop $line;
            my $current_word = $line;
            my $baseleng = readw($fh_i);
            read($fh_i, $baserecord, $baseleng);

            # Flush every new word that sorts before or equals the
            # existing word just read.
            for (; $cnt < @words; $cnt++) {
                last unless $words[$cnt] le $current_word;
                my $record = $KeyIndex{$words[$cnt]};
                my $leng = length($record);
                if ($current_word eq $words[$cnt]) {
                    # Same word: append the new entries to the existing
                    # record, first id stored as a difference.
                    my $last_docid = get_last_docid($baserecord, 2);
                    my $adjrecord = adjust_first_docid($record, $last_docid);
                    check_records(\$record, \$baserecord, 2) unless defined $adjrecord; # namazu-bugs-ja#31
                    $record = $adjrecord;
                    $leng = length($record);    # re-measure
                    $emit->(pack("w", $leng + $baseleng) . $baserecord . $record,
                            "$current_word\n");
                    $cnt++;
                    next BASE_WORD;
                } else {
                    $emit->(pack("w", $leng) . $record, "$words[$cnt]\n");
                }
            }

            # The existing word has no new entries: copy it unchanged.
            $emit->(pack("w", $baseleng) . $baserecord, "$current_word\n");
        }
    }

    # New words that sort after every existing word.
    while ($cnt < @words) {
        my $record = $KeyIndex{$words[$cnt]};
        $emit->(pack("w", length($record)) . $record, "$words[$cnt]\n");
        $cnt++;
    }
    %KeyIndex = ();
    %KeyIndexLast = ();

    util::fclose($fh_wi);
    util::fclose($fh_w) if (defined $fh_w);
    util::fclose($fh_ii);
    util::fclose($fh_i) if (defined $fh_i);
    util::fclose($fh_tmp_w);
    util::fclose($fh_tmp_i);

    return $key_count;
}

#
# Decide the media type.
# FIXME: Very ad hoc. It's just a compromise. -- satoru
#
# decide_type($name, $cont): $name is the type guessed from the file
# name, $cont the type guessed from the contents.  Returns $name when
# $cont is undefined, equal to $name, or matches one of the cases below;
# otherwise returns $cont.
#
sub decide_type ($$) {
    my ($name, $cont) = @_;
    return $name if (!defined $cont || $name eq $cont);

    util::dprint("decide_type: name: $name, cont: $cont\n");
    if ($cont =~ m!^text/plain! && $name =~ m!^text/plain!) {
        return $name;
    } elsif ($cont =~ m!^application/octet-stream!
             && $name !~ m!^text/!) {
        return $name;
    } elsif ($cont =~ m!^application/(excel|powerpoint|msword)!
             && $name !~ m!^application/octet-stream!) {
        # FIXME: Currently File::MMagic 1.02's checktype_data()
        # is unreliable for them.
        return $name;
    } elsif ($cont =~ m!^application/x-zip! && $name =~ m!^application/!) {
        # zip format is used by other applications e.g. OpenOffice.
        # It is necessary to check the extension as well.
        return $name;
    }

    return $cont;
}

#
# Debugging code for the "negative numbers" problem.
#
# check_records(\$record, \$baserecord, $step): dump both records with
# dump_record(), report the problem on STDERR and exit with status 1.
#
sub check_records ($$$) {
    my ($recref, $baserecref, $step) = @_;
    dump_record($baserecref, $step);
    dump_record($recref, $step);
    print STDERR "The \x22negative number\x22 problem occurred.\n";
    exit(1);
}

#
# dump_record(\$record, $step)
#
# Append the values of the BER-compressed record, in hexadecimal, to
# NMZ.bug.info in the current directory.
#
sub dump_record($$) {
    my ($recref, $step) = @_;
    my (@data) = unpack 'w*', $$recref;
    print STDERR "dump record data to NMZ.bug.info (step: $step)...";
    my $fh_info = util::fopen(">> NMZ.bug.info");
    # util::fopen() yields undef on failure; the dump is only a debugging
    # aid, so skip it rather than die before the caller reports the error.
    unless (defined $fh_info) {
        print STDERR " (cannot open NMZ.bug.info, dump skipped)\n";
        return;
    }
    print $fh_info "dumped record data (step: $step)...";
    foreach (@data) {
        print $fh_info sprintf(" %08x", $_);
    }
    print $fh_info "\n";
    util::fclose($fh_info);
    return;
}

#
# Signal handler: only report the signal name on STDERR.
#
sub trapintr {
    my ($signame) = @_;
    print STDERR "Warning: signal $signame occured.\n";
}

#
# For avoiding "used only once: possible typo at ..." warnings.
#
muda($conf::ON_MEMORY_MAX, $conf::WORD_LENG_MAX, $conf::TEXT_SIZE_MAX,
     $conf::DENY_FILE, $var::INTSIZE,
     $conf::CHASEN_NOUN, $conf::CHASEN, $conf::KAKASI,
     $var::Opt{'okurigana'}, $var::Opt{'hiragana'},
     $conf::DIRECTORY_INDEX, $usage::USAGE, $var::Opt{'noheadabst'},
     $usage::VERSION_INFO, $var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX,
     $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE,
     $conf::ADDRESS, $var::MAILING_ADDRESS,
     $conf::FILE_SIZE_MAX,
     $conf::MECAB,
     $conf::DENY_DDN,
     $var::TRAC_URI,
     );

sub muda {}