3 # mknmz - indexer of Namazu
4 # $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $
6 # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
7 # Copyright (C) 2000-2008 Namazu Project All rights reserved.
8 # This is free software with ABSOLUTELY NO WARRANTY.
10 # This program is free software; you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation; either versions 2, or (at your option)
15 # This program is distributed in the hope that it will be useful
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
20 # You should have received a copy of the GNU General Public License
21 # along with this program; if not, write to the Free Software
22 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
25 # This file must be encoded in EUC-JP encoding
37 use strict
; # be strict since v1.2.0
44 # It exists only for back compatibility.
45 $SYSTEM = $English::OSNAME;
47 my $NAMAZU_INDEX_VERSION = "2.0";
49 my $CodingSystem = "euc";
50 my $PKGDATADIR = $ENV{'pkgdatadir'} || "/usr/share/namazu";
51 my $CONFDIR = "/etc/namazu"; # directory where mknmzrc are in.
52 my $LIBDIR = $PKGDATADIR . "/pl"; # directory where library etc. are in.
53 my $FILTERDIR = $PKGDATADIR . "/filter"; # directory where filters are in.
54 my $TEMPLATEDIR = $PKGDATADIR . "/template"; # directory where templates are in.
56 my $DeletedFilesCount = 0;
57 my $UpdatedFilesCount = 0;
60 my %PhraseHashLast = ();
62 my %KeyIndexLast = ();
63 my %CheckPoint = ("on" => undef, "continue" => undef);
64 my $ConfigFile = undef;
65 my $MediaType = undef;
67 my $ReplaceCode = undef; # perl code for transforming URI
69 my @LoadedRcfiles = ();
70 my $Magic = new File::MMagic;
78 my $start_time = time;
80 if ($English::PERL_VERSION == 5.008001) {
81 unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) {
82 print "Run mknmz with the environment variable PERL_HASH_SEED=0\n";
89 # At first, loading pl/conf.pl to prevent overriding some variables.
92 # set LANG and bind textdomain
94 textdomain('namazu', $util::LANG_MSG);
97 my ($output_dir, @targets) = parse_options();
98 my ($docid_base, $total_files_num) = prep($output_dir, @targets);
103 my $total_files_size = 0;
107 my $processed_files_size = 0;
109 if ($CheckPoint{'continue'}) {
111 eval util::readfile($var::NMZ{'_checkpoint'}) ;
113 print $total_files_num . _(" files are found to be indexed.\n");
117 my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
118 my $fh_flist = util::efopen($var::NMZ{'_flist'});
119 my %field_indices = ();
120 get_field_index_base(\%field_indices);
122 if ($CheckPoint{'continue'}) {
123 seek($fh_flist, $flist_ptr, 0);
126 # Process target files one by one
127 while (defined(my $line = <$fh_flist>)) {
128 $flist_ptr += length($line);
131 util::dprint(_("target file: ")."$cfile\n");
133 my ($cfile_size, $num) =
134 process_file($cfile, $docid_count, $docid_base,
135 $file_count, \%field_indices,
136 $fh_errorsfile, $total_files_num);
141 $docid_count += $num;
145 $total_files_size += $cfile_size;
146 $processed_files_size += $cfile_size;
147 last if $ReceiveTERM;
148 if ($processed_files_size > $conf::ON_MEMORY_MAX) {
150 $key_count = write_index();
151 print _("Writing index files...");
155 $processed_files_size = 0;
156 $checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
160 util::fclose($fh_flist);
161 util::fclose($fh_errorsfile);
163 # This should be out of above blocks because of file handler closing.
164 re_exec($flist_ptr, $docid_count, $docid_base, $start_time,
165 $total_files_size, $total_files_num,
166 $file_count, $key_count) if $checkpoint;
169 $key_count = write_index();
170 print _("Writing index files...");
175 $key_count = get_total_keys() unless $key_count;
176 do_remain_job($total_files_size, $docid_count, $key_count,
182 # FIXME: Very complicated.
184 sub process_file ($$$$\%$$) {
185 my ($cfile, $docid_count, $docid_base, $file_count,
186 $field_indices, $fh_errorsfile, $total_files_num) = @_;
188 my $processed_num = 0;
189 my $file_size = util::filesize($cfile);
191 if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
193 @parts = htmlsplit::split($cfile, "NMZ.partial")
194 if ($file_size <= $conf::FILE_SIZE_MAX);
197 for my $part (@parts) {
198 next if (defined $conf::EXCLUDE_PATH &&
199 "$cfile#$part" =~ /$conf::EXCLUDE_PATH/);
200 my $fname = util::tmpnam("NMZ.partial.$id");
201 my $fragment = defined $part ? $part : undef;
202 my $uri = generate_uri($cfile, $fragment);
203 my $result = namazu_core($fname,
204 $docid_count + $processed_num,
205 $docid_base, $file_count,
206 $field_indices, $fh_errorsfile,
211 my $rname = defined $part ? "$cfile\t$part" : "$cfile";
212 put_registry($rname);
217 return ($file_size, $processed_num);
220 my $result = namazu_core($cfile, $docid_count, $docid_base,
221 $file_count, $field_indices,
222 $fh_errorsfile, $total_files_num,
223 undef, undef, undef);
226 put_registry($cfile);
229 return ($file_size, $processed_num);
235 # 1. MKNMZRC environment
237 # 2. $(sysconfdir)/$(PACKAGE)/mknmzrc
241 # 4. user-specified mknmzrc set by mknmz --config=file option.
243 # If multiple files exists, read all of them.
245 sub load_rcfiles () {
248 # To support Windows. Since they have nasty drive letter convention,
249 # it is necessary to change mknmzrc dynamically with env. variable.
250 push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'};
251 push @cand, "$CONFDIR/mknmzrc";
252 push @cand, "$ENV{'HOME'}/.mknmzrc";
254 util::vprint(_("Reading rcfile: "));
255 for my $rcfile (@cand) {
257 load_rcfile ($rcfile);
258 util::vprint(" $rcfile");
264 sub load_rcfile ($) {
266 if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
267 util::win32_yen_to_slash(\$rcfile);
269 return if (grep {m/^$rcfile$/} @LoadedRcfiles);
273 push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'";
275 push @LoadedRcfiles, $rcfile;
279 $LIBDIR = $conf::LIBDIR
280 if (defined $conf::LIBDIR && -d $conf::LIBDIR);
281 $FILTERDIR = $conf::FILTERDIR
282 if (defined $conf::FILTERDIR && -d $conf::FILTERDIR);
283 $TEMPLATEDIR = $conf::TEMPLATEDIR
284 if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR);
287 sub re_exec($$$$$$$$) {
288 my ($flist_ptr, $docid_count, $docid_base, $start_time,
289 $total_files_size, $total_files_num, $file_count, $key_count) = @_;
293 my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}");
295 print $fh_checkpoint <<EOM;
296 \$DeletedFilesCount = $DeletedFilesCount;
297 \$UpdatedFilesCount = $UpdatedFilesCount;
298 \$APPENDMODE = $APPENDMODE;
299 \$flist_ptr = $flist_ptr;
300 \$docid_count = $docid_count;
301 \$docid_base = $docid_base;
302 \$start_time = $start_time;
303 \$total_files_size = $total_files_size;
304 \$total_files_num = $total_files_num;
305 \$key_count = $key_count;
306 \$file_count = $file_count;
309 util
::fclose
($fh_checkpoint);
312 @ARGV = ("-S", @ARGV) ;
313 print _
("Checkpoint reached: re-exec mknmz...\n");
314 util
::dprint
(join ' ', ("::::", @ARGV, "\n"));
318 sub put_registry
($) {
320 my $fh_registry = util
::efopen
(">>$var::NMZ{'_r'}");
321 print $fh_registry $filename, "\n";
322 util
::fclose
($fh_registry);
327 # $CodingSystem: Character Coding System 'euc' or 'sjis'
329 if (($English::OSNAME
eq "MSWin32") || ($English::OSNAME
eq "os2")) {
330 $CodingSystem = "sjis";
331 if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m
#^([A-Z]:)(/|\\)#i) {
332 $CONFDIR = $1 . $CONFDIR ;
334 if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m
#^([A-Z]:)(/|\\)#i) {
335 $LIBDIR = $1 . $LIBDIR ;
337 if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m
#^([A-Z]:)(/|\\)#i) {
338 $FILTERDIR = $1 . $FILTERDIR ;
340 if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m
#^([A-Z]:)(/|\\)#i) {
341 $TEMPLATEDIR = $1 . $TEMPLATEDIR ;
344 $CodingSystem = "euc";
348 util
::cdie
("SIGINT caught! Aborted.\n");
352 print STDERR
"SIGTERM caught!\n";
357 sub preload_modules
() {
358 unshift @INC, $LIBDIR;
359 # workaround for test suites.
360 unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'};
362 require "var.pl" || die "unable to require \"var.pl\"\n";
363 require "conf.pl" || die "unable to require \"conf.pl\"\n";
364 require "util.pl" || die "unable to require \"util.pl\"\n";
365 require "gettext.pl" || die "unable to require \"gettext.pl\"\n";
366 require "ext.pl" || die "unable to require \"ext.pl\"\n";
369 sub postload_modules
() {
370 require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n";
373 sub load_modules
() {
374 require "usage.pl" || die "unable to require \"usage.pl\"\n";
375 require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n";
376 require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
377 require "seed.pl" || die "unable to require \"seed.pl\"\n";
378 require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";
380 @Seed = seed
::init
();
383 sub load_filtermodules
() {
384 unshift @INC, $FILTERDIR;
387 # Windows modules must be loaded first.
388 # Because OLE filters have low precedence over normal ones.
390 load_win32modules
() if $English::OSNAME
eq "MSWin32";
392 # Check filter modules
394 @filters = glob "$FILTERDIR/*.pl";
396 load_filters
(@filters);
399 sub load_win32modules
() {
400 # Check filter modules
402 if (-f
"../filter/win32/olemsword.pl") { # to ease developing
403 @filters = glob "../filter/win32/*.pl";
404 unshift @INC, "../filter/win32";
406 @filters = glob "$FILTERDIR/win32/*.pl";
407 unshift @INC, "$FILTERDIR/win32";
410 load_filters
(@filters);
413 sub load_filters
(@
) {
416 for my $filter (@filters) {
417 $filter =~ m!([-\w]+)\.pl$!;
419 require "$module.pl" || die "unable to require \"$module.pl\"\n";;
420 my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv);
422 eval "\@mtypes = ${module}::mediatype();";
423 die $@
if $@
; # eval error
424 eval "\$status = ${module}::status();";
426 eval "\$recursive = ${module}::recursive();";
428 eval "\$pre_codeconv = ${module}::pre_codeconv();";
430 eval "\$post_codeconv = ${module}::post_codeconv();";
432 eval "${module}::add_magic(\$Magic);";
435 for my $mt (@mtypes) {
436 next if (defined $var::Supported
{$mt} &&
437 $var::Supported
{$mt} eq 'yes' && $status eq 'no');
438 $var::Supported
{$mt} = $status;
439 $var::REQUIRE_ACTIONS
{$mt} = $module;
440 $var::RECURSIVE_ACTIONS
{$mt} = $recursive;
441 $var::REQUIRE_PRE_CODECONV
{$mt} = $pre_codeconv;
442 $var::REQUIRE_POST_CODECONV
{$mt} = $post_codeconv;
449 # FIXME: Too many parameters. They must be cleared.
451 sub namazu_core
($$$$$$$$$$) {
452 my ($cfile, $docid_count, $docid_base,
453 $file_count, $field_indices, $fh_errorsfile, $total_files_num,
454 $uri, $part_id, $part_num) = @_;
458 my $weighted_str = "";
463 $msg_prefix = " $part_id/$part_num - ";
465 $msg_prefix = $file_count + 1 . "/$total_files_num - ";
469 $uri = generate_uri
($cfile); # Make a URI from a file name.
471 my ($cfile_size, $text_size, $kanji, $mtype) =
472 load_document
(\
$cfile, \
$content, \
$weighted_str,
473 \
$headings, \
%fields);
476 $fields{'mtime'} = (stat($cfile))[9];
477 my $utc = $fields{'mtime'};
478 $utc = time::rfc822time_to_mtime
($fields{'date'})
479 if (defined $fields{'date'});
481 my $date = $fields{'date'};
482 print "$cfile Illegal date format. : $date\n";
483 print $fh_errorsfile "$cfile Illegal date format. : $date\n";
484 $utc = $fields{'mtime'};
485 delete $fields{'date'};
487 $fields{'utc'} = $utc;
490 util
::dprint
(_
("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n");
492 # Check if the file is acceptable.
493 my $err = check_file
($cfile, $cfile_size, $text_size, $mtype, $uri);
495 if (($English::OSNAME
eq "MSWin32") || ($English::OSNAME
eq "os2")) {
496 my $uri2 = codeconv
::eucjp_to_shiftjis
($uri);
497 print $msg_prefix . "$uri2 $err\n";
499 print $msg_prefix . "$uri $err\n";
501 print $fh_errorsfile "$cfile $err\n";
502 return 0; # return 0 if error
505 # Print processing file name as URI.
506 if (($English::OSNAME
eq "MSWin32") || ($English::OSNAME
eq "os2")) {
507 my $uri2 = codeconv
::eucjp_to_shiftjis
($uri);
508 print $msg_prefix . "$uri2 [$mtype]\n";
510 print $msg_prefix . "$uri [$mtype]\n";
514 my $filename = defined $cfile ?
$cfile : '';
515 codeconv
::toeuc
(\
$filename);
516 $filename = basename
($filename);
517 $fields{'filename'} = $filename;
519 complete_field_info
(\
%fields, $cfile, $uri,
520 \
$headings, \
$content, \
$weighted_str);
521 put_field_index
(\
%fields, $field_indices);
523 put_dateindex
($cfile);
524 $content .= "\n\n$filename\n\n"; # add filename
525 $content .= $weighted_str; # add weights
526 count_words
($docid_count, $docid_base, \
$content, $kanji);
527 make_phrase_hash
($docid_count, $docid_base, \
$content);
530 util
::assert
($cfile_size != 0,
531 "cfile_size == 0 at the end of namazu_core.");
537 # Make the URI from the given file name.
539 sub generate_uri
(@
) {
540 my ($file, $fragment) = @_;
541 return "" unless defined $file;
543 # omit a file name if omittable
544 $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o;
546 if (defined $ReplaceCode) {
547 # transforming URI by evaling
553 if (($English::OSNAME
eq "MSWin32") || ($English::OSNAME
eq "os2")) {
554 $file =~ s
#^([A-Z]):#/$1|#i; # converting a drive part like: /C|
557 if (($English::OSNAME
eq "MSWin32") || ($English::OSNAME
eq "os2")) {
558 $file = codeconv
::shiftjis_to_eucjp
($file);
560 if (defined $fragment) {
561 codeconv
::toeuc
(\
$fragment);
564 unless ($var::Opt
{'noencodeuri'}) {
565 for my $tmp ($file, $fragment) {
566 next unless defined $tmp;
568 # Escape unsafe characters (not strict)
569 $tmp =~ s/\%/%25/g; # Convert original '%' into '%25' v1.1.1.2
570 $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/
571 sprintf("%%%02X",ord($1))/ge;
577 $uri .= "#" . $fragment if defined $fragment;
578 if (($English::OSNAME
eq "MSWin32") || ($English::OSNAME
eq "os2")) {
579 # restore '|' for drive letter rule of Win32, OS/2
580 $uri =~ s!^/([A-Z])%7C!/$1|!i;
586 sub get_field_index_base
(\
%) {
587 my ($field_indices) = @_;
589 my @keys = split('\|', $conf::SEARCH_FIELD
);
590 if ($var::Opt
{'meta'}) {
591 push @keys, (split '\|', $conf::META_TAGS
);
593 for my $key (@keys) {
595 my $fname = "$var::NMZ{'field'}.$key";
596 my $tmp_fname = util
::tmpnam
("NMZ.field.$key");
598 $size = -s
$fname if -f
$fname;
599 $size += -s
$tmp_fname if -f
$tmp_fname;
600 $field_indices->{$key} = $size;
604 sub complete_field_info
(\
%$$\
$\
$\
$) {
605 my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_;
607 for my $field (keys %{$fields}) {
608 if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) {
609 delete $fields->{$field};
613 unless (defined($fields->{'title'})) {
614 $fields->{'title'} = gfilter
::filename_to_title
($cfile, $wsref);
616 unless (defined($fields->{'date'})) {
617 my $mtime = $fields->{'mtime'};
618 my $date = util
::rfc822time
($mtime);
619 $fields->{'date'} = $date;
621 unless (defined($fields->{'uri'})) {
622 $fields->{'uri'} = $uri;
624 unless (defined($fields->{'size'})) {
625 $fields->{'size'} = -s
$cfile;
627 unless (defined($fields->{'summary'})) {
628 $fields->{'summary'} = make_summary
($contref, $headings, $cfile);
630 unless (defined($fields->{'from'}) || defined($fields->{'author'})) {
631 $fields->{'from'} = getmsg
("unknown");
636 # Currently, messages for NMZ.* files should be encoded in
637 # EUC-JP currently. We cannot use gettext.pl for the messsage
638 # because gettext.pl may use Shift_JIS encoded messages.
639 # So, we should use the function instead of gettext().
641 # FIXME: Ad hoc impl. getmsg() is effective only for "unknown".
646 if (util
::islang_msg
("ja")) {
647 if ($msg eq "unknown") {
654 sub make_summary
($$$) {
655 my ($contref, $headings, $cfile) = @_;
657 # pick up $conf::MAX_FIELD_LENGTH bytes string
659 if ($$headings ne "") {
660 $$headings =~ s/^\s+//;
661 $$headings =~ s/\s+/ /g;
669 while (($tmplen = $conf::MAX_FIELD_LENGTH
+ 1 - length($tmp)) > 0
670 && $offset < length($$contref))
672 $tmp .= substr $$contref, $offset, $tmplen;
674 $tmp =~ s/(([\xa1-\xfe]).)/$2 eq "\xa8" ? '': $1/ge;
675 $tmp =~ s/([-=*\#])\1{2,}/$1$1/g;
679 my $summary = substr $tmp, 0, $conf::MAX_FIELD_LENGTH
- 1;
680 # Remove a garbage Kanji 1st char at the end.
681 $summary = codeconv
::chomp_eucjp
($summary);
683 $summary =~ s/^\s+//;
684 $summary =~ s/\s+/ /g; # normalize white spaces
690 # output the field infomation into NMZ.fields.* files
691 sub put_field_index
(\
%$) {
692 my ($fields, $field_indices) = @_;
695 join('|', sort {length($b) <=> length($a)} keys %conf::FIELD_ALIASES
);
697 for my $field (keys %{$fields}) {
698 util
::dprint
("Field: $field: $fields->{$field}\n");
699 if ($field =~ /^($aliases_regex)$/o) {
700 unless (defined($fields->{$conf::FIELD_ALIASES
{$field}})) {
701 $fields->{$conf::FIELD_ALIASES
{$field}} = $fields->{$field};
703 undef $fields->{$field};
707 my @keys = split '\|', $conf::SEARCH_FIELD
;
708 if ($var::Opt
{'meta'}) {
709 my @meta = split '\|', $conf::META_TAGS
;
710 while (my $meta = shift(@meta)) {
711 $meta = $conf::FIELD_ALIASES
{$meta}
712 if (defined $conf::FIELD_ALIASES
{$meta});
719 @keys = grep {$mark{$_}++; $mark{$_} == 1} @keys;
721 for my $key (@keys) {
723 my $fname = util
::tmpnam
("NMZ.field.$lkey");
724 my $fh_field = util
::efopen
(">>$fname");
726 if (defined($fields->{$key})) {
727 if ($key ne 'uri') { # workaround for namazu-bugs-ja#30
728 $fields->{$key} =~ s/\s+/ /g;
729 $fields->{$key} =~ s/\s+$//;
730 $fields->{$key} =~ s/^\s+//;
732 $output = $fields->{$key};
735 $output = substr $output, 0, $conf::MAX_FIELD_LENGTH
- 1;
736 # Remove a garbage Kanji 1st char at the end.
737 $output = codeconv
::chomp_eucjp
($output);
739 $output =~ s/\n.*$//s;
744 print $fh_field $output;
745 util
::fclose
($fh_field);
747 # put index of field index
749 my $fname = util
::tmpnam
("NMZ.field.$lkey.i");
750 my $fh_field_idx = util
::efopen
(">>$fname");
751 print $fh_field_idx pack("N", $field_indices->{$lkey});
752 $field_indices->{$lkey} += length $output;
753 util
::fclose
($fh_field_idx);
759 # put the date infomation into NMZ.t file
760 sub put_dateindex
($) {
762 my $mtime = (stat($cfile))[9];
764 my $fh_dataindex = util
::efopen
(">>$var::NMZ{'_t'}");
765 print $fh_dataindex pack("N", $mtime);
766 util
::fclose
($fh_dataindex);
770 # load a document file
771 sub load_document
($$$$\
%) {
772 my ($orig_cfile, $contref, $weighted_str, $headings, $fields)
774 my $cfile = $$orig_cfile;
776 return (0, 0, 0, 0) unless (-f
$cfile && util
::canopen
($cfile));
778 # for handling a filename which contains Shift_JIS code for Windows.
779 # for handling a filename which contains including space.
780 my $shelter_cfile = "";
781 if (($cfile =~ /\s/) ||
782 ($English::OSNAME
eq "MSWin32"
783 && $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/) )
785 $shelter_cfile = $cfile;
786 $cfile = util
::tmpnam
("NMZ.win32");
787 unlink $cfile if (-e
$cfile);
788 copy
($shelter_cfile, $cfile);
792 $file_size = util
::filesize
($cfile); # not only file in feature.
793 if ($file_size > $conf::FILE_SIZE_MAX
) {
794 return ($file_size, $file_size, 0, 'x-system/x-error; x-error=file_size_max');
797 $$contref = util
::readfile
($cfile);
798 # $file_size = length($$contref);
800 my ($kanji, $mtype) = apply_filter
($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, undef);
802 if ($English::OSNAME
eq "MSWin32" && $shelter_cfile ne "") {
804 $cfile = $shelter_cfile;
807 # Measure the text size at this time.
808 my $text_size = length($$contref) + length($$weighted_str);
810 return ($file_size, $text_size, $kanji, $mtype);
813 sub apply_filter
($$$$$$$) {
814 my ($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, $mmtype)
816 my $cfile = $shelter_cfile ne "" ?
$shelter_cfile : $$orig_cfile;
822 if (defined $MediaType) {
824 } elsif (defined $mmtype) {
827 my $mtype_n = $Magic->checktype_byfilename($cfile);
828 my $mtype_c = $Magic->checktype_data($$contref);
830 $mtype_m = $Magic->checktype_magic($$contref)
831 if ((! defined $mtype_c) ||
833 /^(text\/html
|text\
/plain|application\/octet
-stream
)$/);
835 if (defined $mtype_m &&
837 /^(text\/html
|text\
/plain|application\/octet
-stream
)$/);
838 $mtype_c = 'text/plain' unless defined $mtype_c;
842 $mtype = decide_type
($mtype_n, $mtype_c);
846 util
::dprint
(_
("Detected type: ")."$mtype\n");
848 # Pre code conversion.
849 if ($var::REQUIRE_PRE_CODECONV
{$mtype}) {
850 util
::dprint
("pre_codeconv\n");
851 codeconv_document
($contref);
854 if (! $var::Supported
{$mtype} ||
855 $var::Supported
{$mtype} ne 'yes')
857 util
::vprint
(_
("Unsupported media type ")."$mtype\n");
858 return (0, "$mtype; x-system=unsupported");
861 if ($var::REQUIRE_ACTIONS
{$mtype}) {
862 util
::vprint
(_
("Using ")."$var::REQUIRE_ACTIONS{$mtype}.pl\n");
863 require $var::REQUIRE_ACTIONS
{$mtype}.'.pl'
864 || die _
("unable to require ") .
865 "\"$var::REQUIRE_ACTIONS{$mtype}.pl\"\n";
868 local $SIG{'PIPE'} = \
&trapintr
;
869 eval '$err = ' . $var::REQUIRE_ACTIONS
{$mtype} .
870 '::filter($orig_cfile, $contref, $weighted_str, $headings, $fields);';
873 if ($err =~ m/; x-system=unsupported$/) {
876 return (0, "$mtype; x-error=$err");
880 util
::vprint
(_
("Failed to call ")."$var::REQUIRE_ACTIONS{$mtype}\n$@\n");
881 return (0, "$mtype; x-error=$@");
884 # Post code conversion.
885 if ($var::REQUIRE_POST_CODECONV
{$mtype}) {
886 util
::dprint
("post_codeconv\n");
887 codeconv_document
($contref);
890 next if ($var::RECURSIVE_ACTIONS
{$mtype});
895 my $kanji = $$contref =~ tr/\xa1-\xfe/\xa1-\xfe/; # Kanji contained?
896 $kanji += $$weighted_str =~ tr/\xa1-\xfe/\xa1-\xfe/;
898 return ($kanji, $mtype);
901 sub codeconv_document
($) {
903 codeconv
::toeuc
($textref);
904 $$textref =~ s/\r\n/\n/g;
905 $$textref =~ s/\r/\n/g;
906 $$textref =~ tr/\x01-\x08\x0b-\x0c\x0e-\x1f\x7f/ /; # Remove control char.
911 my $output_dir = shift @_ ;
915 $var::OUTPUT_DIR
= $output_dir;
919 check_present_index
();
921 # if Checkpoint mode, return
922 return (0, 0) if $CheckPoint{'continue'};
924 check_lockfile
($var::NMZ
{'lock2'});
925 print _
("Looking for indexing files...\n");
926 @flist = find_target
(@targets);
927 ($docid_base, @flist) = append_index
(@flist)
928 if -f
$var::NMZ
{'r'};
929 unless (@flist) { # if @flist is empty
930 print _
("No files to index.\n");
933 set_lockfile
($var::NMZ
{'lock2'});
935 my $total_files_num = @flist;
937 return ($docid_base, $total_files_num);
942 return if (@flist == 0);
944 my $fh_flist = util
::efopen
(">$var::NMZ{'_flist'}");
945 print $fh_flist join("\n", @flist), "\n";
946 util
::fclose
($fh_flist);
949 sub require_modules
() {
950 if (util
::islang
("ja") && $conf::NKF
=~ /^module_nkf/) {
951 require NKF
|| die "unable to require \"NKF\"\n";
952 util
::dprint
(_
("code conversion: using NKF module\n"));
953 $var::USE_NKF_MODULE
= 1;
955 if (util
::islang
("ja") && $conf::WAKATI
=~ /^module_kakasi/) {
956 require Text
::Kakasi
|| die "unable to require \"Text::Kakasi\"\n";
957 util
::dprint
(_
("wakati: using Text::Kakasi module\n"));
958 my $res = Text
::Kakasi
::getopt_argv
('kakasi', '-ieuc', '-oeuc', '-w');
960 if (util
::islang
("ja") && $conf::WAKATI
=~ /^module_chasen/) {
961 require Text
::ChaSen
|| die "unable to require \"Text::ChaSen\"\n";
962 util
::dprint
(_
("wakati: using Text::ChaSen module\n"));
963 my @arg = ('-i', 'e', '-j', '-F', '%m ');
964 @arg = ('-i', 'e', '-j', '-F', '%m %H\\n') if $var::Opt
{'noun'};
965 my $res = Text
::ChaSen
::getopt_argv
('chasen-perl', @arg);
967 if (util
::islang
("ja") && $conf::WAKATI
=~ /^module_mecab/) {
968 require MeCab
|| die "unable to require \"MeCab\"\n";
969 util
::dprint
(_
("wakati: using MeCab module\n"));
973 sub check_lockfile
($) {
974 # warn if check file exists in case other process is running or abnormal
975 # stop execution (later is not the major purpose, though).
976 # This is mainly for early detection before longish find_target.
980 print "$file "._
("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n");
985 sub set_lockfile
($) {
990 print "$file found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n";
993 my $fh_lockfile = util
::efopen
(">$file");
994 print $fh_lockfile "$$"; # save pid
995 util
::fclose
($fh_lockfile);
999 sub remove_lockfile
($) {
1003 unlink $file if -f
$file;
1006 # check present index whether it is old type of not
1007 sub check_present_index
() {
1008 if (-f
$var::NMZ
{'i'} && ! -f
"$var::NMZ{'wi'}")
1010 util
::cdie
(_
("Present index is old type. it's unsupported.\n"));
1015 sub do_remain_job
($$$$) {
1016 my ($total_files_size, $docid_count, $key_count, $start_time) = @_;
1018 if ($docid_count == 0) {
1019 # No files are indexed
1020 if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
1022 update_registry
($docid_count);
1025 set_lockfile
($var::NMZ
{'lock'});
1029 write_result_file
();
1030 update_field_index
();
1032 update_registry
($docid_count);
1035 remove_lockfile
($var::NMZ
{'lock'});
1037 make_headfoot_pages
($docid_count, $key_count);
1038 put_log
($total_files_size, $start_time, $docid_count, $key_count);
1039 util
::remove_tmpfiles
();
1040 unlink $var::NMZ
{'_flist'};
1043 sub make_headfoot_pages
($$) {
1044 my ($docid_count, $key_count) = @_;
1046 for my $file (glob "$TEMPLATEDIR/NMZ.head*") {
1047 if ($file =~ m!^.*/NMZ\.head(\.[-\w\.]+)?$!){
1048 my $suffix = $1 ?
$1 : '';
1049 make_headfoot
("$var::NMZ{'head'}${suffix}", $docid_count, $key_count);
1052 for my $file (glob "$TEMPLATEDIR/NMZ.foot*") {
1053 if ($file =~ m!^.*/NMZ\.foot(\.[-\w\.]+)?$!){
1054 my $suffix = $1 ?
$1 : '';
1055 make_headfoot
("$var::NMZ{'foot'}${suffix}", $docid_count, $key_count);
1060 # Parse command line options.
1069 my $targets_loaded = 0;
1074 my $opt_version = 0;
1078 my $opt_chasen_noun = 0;
1081 my $opt_checkpoint_sub = 0;
1082 my $opt_show_config = 0;
1083 my $opt_mailnews = 0;
1084 my $opt_mhonarc = 0;
1087 my $opt_quiet = undef;
1088 my $opt_config = undef;
1089 my $output_dir = undef;
1090 my $update_index = undef;
1091 my $include_file = undef;
1092 my $target_list = undef;
1093 my $index_lang = undef;
1097 # Getopt::Long::Configure('bundling');
1098 Getopt
::Long
::config
('bundling');
1100 '0|help' => \
$opt_help,
1101 '1|exclude=s' => \
$opt_conf{'EXCLUDE_PATH'},
1102 '2|deny=s' => \
$opt_conf{'DENY_FILE'},
1103 '3|allow=s' => \
$opt_conf{'ALLOW_FILE'},
1104 '4|update=s' => \
$update_index,
1105 '5|mhonarc' => \
$opt_mhonarc,
1106 '6|mtime=s' => \
$var::Opt
{'mtime'},
1107 '7|html-split' => \
$var::Opt
{'htmlsplit'},
1108 'C|show-config' => \
$opt_show_config,
1109 'E|no-edge-symbol' => \
$var::Opt
{'noedgesymbol'},
1110 'F|target-list=s' => \
$target_list,
1111 'G|no-okurigana' => \
$var::Opt
{'okurigana'},
1112 'H|no-hiragana' => \
$var::Opt
{'hiragana'},
1113 'I|include=s' => \
$include_file,
1114 'K|no-symbol' => \
$var::Opt
{'nosymbol'},
1115 'L|indexing-lang=s' => \
$index_lang,
1116 'M|meta' => \
$var::Opt
{'meta'},
1117 'O|output-dir=s' => \
$output_dir,
1118 'S|checkpoint-sub' => \
$opt_checkpoint_sub,
1119 'T|template-dir=s' => \
$TEMPLATEDIR,
1120 'U|no-encode-uri' => \
$var::Opt
{'noencodeuri'} ,
1121 'V|verbose' => \
$var::Opt
{'verbose'},
1122 'Y|no-delete' => \
$var::Opt
{'nodelete'},
1123 'Z|no-update' => \
$var::Opt
{'noupdate'},
1124 'a|all' => \
$opt_all,
1125 'b|use-mecab' => \
$opt_mecab,
1126 'c|use-chasen' => \
$opt_chasen,
1127 'd|debug' => \
$var::Opt
{'debug'},
1128 'e|robots' => \
$var::Opt
{'robotexclude'},
1129 'f|config=s' => \
$opt_config,
1130 'h|mailnews' => \
$opt_mailnews,
1131 'k|use-kakasi' => \
$opt_kakasi,
1132 'm|use-chasen-noun' => \
$opt_chasen_noun,
1133 'q|quiet' => \
$opt_quiet,
1134 'r|replace=s' => \
$ReplaceCode,
1135 's|checkpoint' => \
$CheckPoint{'on'},
1136 't|media-type=s' => \
$MediaType,
1137 'u|uuencode' => \
$opt_dummy, # for backward compat.
1138 'v|version' => \
$opt_version,
1139 'x|no-heading-summary'=> \
$var::Opt
{'noheadabst'},
1140 'z|check-filesize' => \
$var::Opt
{'checkfilesize'},
1141 'decode-base64' => \
$var::Opt
{'decodebase64'},
1142 'norc' => \
$opt_norc,
1146 # Make STDOUT quiet by redirecting STDOUT to null device.
1147 my $devnull = util
::devnull
();
1148 open(STDOUT
, ">$devnull") || die "$devnull: $!";
1151 if (defined $update_index) {
1152 unless (-d
$update_index) {
1153 print _
("No such index: "), "$update_index\n";
1157 my $orig_status = $var::NMZ
{'status'};
1158 $var::NMZ
{'status'} = "$update_index/$var::NMZ{'status'}";
1160 my $argv = get_status
("argv");
1161 if (!defined $argv) {
1162 print _
("No such index: "), "$update_index\n";
1165 @ARGV = split /\t/, $argv;
1166 util
::dprint
(_
("Inherited argv: ")."@ARGV\n");
1168 my $cwd = get_status
("cwd");
1169 if (!defined $cwd) {
1170 print _
("No such index: "), "$update_index\n";
1174 util
::dprint
(_
("Inherited cwd: ")."$cwd\n");
1176 ($output_dir, @targets) = parse_options
();
1177 $output_dir = $update_index;
1178 $var::NMZ
{'status'} = $orig_status; # See also change_filenames()
1179 return ($output_dir, @targets);
1182 if (!$opt_norc && !(defined $ENV{'MKNMZNORC'})){
1186 if (-f
$opt_config) {
1187 util
::vprint
(_
("Reading rcfile: "));
1188 load_rcfile
($ConfigFile = $opt_config);
1189 util
::vprint
(" $opt_config\n");
1194 $util::LANG
= $index_lang;
1195 util
::dprint
("Override indexing language: $util::LANG\n");
1208 load_filtermodules
(); # to make effect $opt_config, $index_lang.
1211 foreach my $key (keys %opt_conf){
1212 if (defined ($opt_conf{$key})) {
1213 ${*{$conf::{$key}}{SCALAR
}} = $opt_conf{$key};
1217 if ($opt_mailnews) {
1218 $MediaType = 'message/rfc822';
1221 $MediaType = 'text/html; x-type=mhonarc';
1224 $conf::ALLOW_FILE
= ".*";
1227 $conf::WAKATI
= $conf::CHASEN
;
1228 $var::Opt
{'noun'} = 0;
1230 if ($opt_chasen_noun) {
1231 $conf::WAKATI
= $conf::CHASEN_NOUN
;
1232 $var::Opt
{'noun'} = 1;
1235 $conf::WAKATI
= $conf::KAKASI
;
1236 $var::Opt
{'noun'} = 0;
1239 $conf::WAKATI
= $conf::MECAB
;
1240 $var::Opt
{'noun'} = 0;
1242 if ($include_file) {
1244 util
::dprint
("Included: $include_file\n");
1247 if ($CheckPoint{'continue'}) {
1248 @targets = ("dummy");
1250 @targets = load_target_list
($target_list);
1251 util
::dprint
(_
("Loaded: ")."$target_list\n");
1253 $targets_loaded = 1;
1255 if ($opt_checkpoint_sub) {
1256 $CheckPoint{'on'} = 1;
1257 $CheckPoint{'continue'} = 1;
1258 @argv = grep {! /^-S$/} @argv; # remove -S
1261 if (defined $ReplaceCode) {
1262 my $orig = "/foo/bar/baz/quux.html";
1265 if ($@
) { # eval error
1266 util
::cdie
(_
("Invalid replace: ")."$ReplaceCode\n");
1268 util
::dprint
(_
("Replace: ")."$orig -> $_\n");
1271 if ($opt_show_config) {
1276 if (@ARGV == 0 && $targets_loaded == 0) {
1281 $output_dir = $cwd unless defined $output_dir;
1282 util
::cdie
("$output_dir: "._
("invalid output directory\n"))
1283 unless (-d
$output_dir && -w
$output_dir);
1285 if ($English::OSNAME
eq "MSWin32" || $English::OSNAME
eq "os2") {
1286 util
::win32_yen_to_slash
(\
$output_dir);
1289 # take remaining @ARGV as targets
1290 if (@ARGV > 0 && $targets_loaded == 0) {
1295 # unshift @ARGV, splice(@argv, 0, @argv - @ARGV);
1298 return ($output_dir, @targets);
1301 sub show_config
() {
1302 print _
("Loaded rcfile: ") . "@LoadedRcfiles\n" if @LoadedRcfiles;
1303 print _
("System: ") . "$English::OSNAME\n" if $English::OSNAME
;
1304 print _
("Namazu: ") . "$var::VERSION\n" if $var::VERSION
;
1305 print _
("Perl: ") . sprintf("%f\n", $English::PERL_VERSION
);
1306 print _
("File-MMagic: ") . "$File::MMagic::VERSION\n" if $File::MMagic
::VERSION
;
1307 print _
("NKF: ") . "$conf::NKF\n" if $conf::NKF
;
1308 print _
("KAKASI: ") . "$conf::KAKASI\n" if $conf::KAKASI
;
1309 print _
("ChaSen: ") . "$conf::CHASEN\n" if $conf::CHASEN
;
1310 print _
("MeCab: ") . "$conf::MECAB\n" if $conf::MECAB
;
1311 print _
("Wakati: ") . "$conf::WAKATI\n" if $conf::WAKATI
;
1312 print _
("Lang_Msg: ") . "$util::LANG_MSG\n";
1313 print _
("Lang: ") . "$util::LANG\n";
1314 print _
("Coding System: ") . "$CodingSystem\n";
1315 print _
("CONFDIR: ") . "$CONFDIR\n";
1316 print _
("LIBDIR: ") . "$LIBDIR\n";
1317 print _
("FILTERDIR: ") . "$FILTERDIR\n";
1318 print _
("TEMPLATEDIR: ") . "$TEMPLATEDIR\n";
1320 my @all_types = keys %var::Supported
;
1321 my @supported = sort grep { $var::Supported
{$_} eq "yes" } @all_types;
1323 my $num_supported = @supported;
1324 my $num_unsupported = @all_types - @supported;
1325 print _
("Supported media types: ") . "($num_supported)\n";
1326 print _
("Unsupported media types: ") . "($num_unsupported) " . _
("marked with minus (-) probably missing application in your \$path.\n");
1327 for my $mtype (sort keys %var::Supported
) {
1328 my $yn = $var::Supported
{$mtype};
1329 if ($yn eq 'yes') { $yn = ' ' } else {$yn = '-'};
1331 if ($var::REQUIRE_ACTIONS
{$mtype}){
1332 print ": $var::REQUIRE_ACTIONS{$mtype}.pl";
1338 sub load_target_list
($) {
1340 my $fh_targets = util
::efopen
($file);
1341 my @targets = <$fh_targets>;
1342 util
::fclose
($fh_targets);
1343 if (($English::OSNAME
eq "MSWin32") || ($English::OSNAME
eq "os2")) {
1344 foreach my $tmp (@targets){
1346 util
::win32_yen_to_slash
(\
$tmp);
1353 # convert a relative path into an absolute path
1354 sub absolute_path
($$) {
1355 my ($cwd, $path) = @_;
1357 $path =~ s!^\.$!\./!;
1358 $path =~ s!^\.[/\\]!$cwd/!;
1359 if (($English::OSNAME
eq "MSWin32") || ($English::OSNAME
eq "os2")) {
1360 util
::win32_yen_to_slash
(\
$path);
1361 if ($path =~ m!^//!) {
1362 } elsif ($path =~ m!^/[^/]!) {
1363 my $driveletter = $cwd;
1364 if ($driveletter =~ m!^([A-Z]:)!i){
1367 $path = "$driveletter$path";
1368 } elsif ($path !~ m!^[A-Z]:/!i) {
1369 $path = "$cwd/$path";
1372 $path =~ s!^([^/])!$cwd/$1!;
1377 sub find_target
(@
) {
1381 @targets = map { absolute_path
($cwd, $_) } @targets;
1383 # Convert \ to / with consideration for Shift_JIS encoding.
1384 if (($English::OSNAME
eq "MSWin32") || ($English::OSNAME
eq "os2")) {
1385 foreach my $tmp (@targets){
1386 util
::win32_yen_to_slash
(\
$tmp);
1390 # For reporting effects of --allow, --deny, --exclude, --mtime
1391 # options in --verbose mode.
1393 $counts{'possible'} = 0;
1394 $counts{'excluded'} = 0;
1395 $counts{'too_old'} = 0;
1396 $counts{'too_new'} = 0;
1397 $counts{'not_allowed'} = 0;
1398 $counts{'denied'} = 0;
1400 # Traverse directories.
1401 # This routine is not efficent but I prefer reliable logic.
1404 util
::vprint
(_
("find_target starting: "). localtime($start). "\n");
1406 my $target = shift @targets;
1408 if ($target eq '') {
1409 print STDERR
"Warning: target contains empty line, skip it\n";
1413 if (-f
$target) { # target is a file.
1414 add_target
($target, \
@flist, \
%counts);
1415 } elsif (-d
$target) { # target is a directory.
1416 my @subtargets = ();
1417 # Find subdirectories in target directory
1418 # because File::Find::find() does not follow symlink.
1420 my $dh = new DirHandle
($target);
1421 while (defined(my $ent = $dh->read)) {
1422 next if ($ent =~ /^\.{1,2}$/);
1423 if ($English::OSNAME
eq "MSWin32" || $English::OSNAME
eq "os2") {
1424 next if ($ent =~ m!^($conf::DENY_DDN)$!i);
1426 util
::win32_yen_to_slash
(\
$tmp);
1427 next if ($ent ne $tmp);
1429 my $fname = "$target/$ent";
1430 next if ($fname eq '.' || $fname eq '..');
1432 push(@subtargets, $fname);
1434 add_target
($fname, \
@flist, \
%counts);
1438 @subtargets = ($target);
1442 # Wanted routine for File::Find's find().
1444 my $wanted_closure = sub {
1445 my $fname = "$File::Find::dir/$_";
1446 add_target
($fname, \
@flist, \
%counts);
1449 find
($wanted_closure, @subtargets) if (@subtargets > 0);
1451 print STDERR _
("unsupported target: ") . $target;
1457 @flist = grep {$mark{$_}++; $mark{$_} == 1} @flist;
1459 # Sort file names with consideration for numbers.
1460 @flist = map { $_->[0] }
1461 sort { $a->[1] cmp $b->[1] }
1462 map { my $tmp = $_; $tmp =~ s/(\d+)/sprintf("%08d", $1)/ge;
1463 [ $_, $tmp ] } @flist;
1465 my $elapsed = time() - $start ;
1466 $elapsed += 1 ; # to round up and avoid 0
1468 # For --verbose option.
1469 report_find_target
($elapsed, $#flist + 1, %counts);
1474 sub add_target
($\@\
%) {
1475 my ($target, $flists_ref, $counts_ref) = @_;
1477 if ($target =~ /[\n\r\t]/) {
1478 $target =~ s/[\n\r\t]//g;
1479 print STDERR
"Warning: $target contains LF/CR/TAB chars, skip it\n";
1480 return; # skip a file name containing LF/CR/TAB chars.
1483 return unless -f
$target; # Only file is targeted.
1485 $counts_ref->{'possible'}++;
1487 unless (util
::canopen
($target)) {
1488 util
::vprint
(sprintf(_
("Unreadable: %s"), $target));
1489 $counts_ref->{'excluded'}++;
1494 if (defined $conf::EXCLUDE_PATH
&&
1495 $target =~ /$conf::EXCLUDE_PATH/ )
1497 util
::vprint
(sprintf(_
("Excluded: %s"), $target));
1498 $counts_ref->{'excluded'}++;
1503 # Do processing just like find's --mtime option.
1505 if (defined $var::Opt
{'mtime'}) {
1507 if ($var::Opt
{'mtime'} < 0) {
1509 # This must be `>=' not `>' for consistency with find(1).
1510 if (int($mtime) >= - $var::Opt
{'mtime'}) {
1511 util
::vprint
(sprintf(_
("Too old: %s"), $target));
1512 $counts_ref->{'too_old'}++;
1515 } elsif ($var::Opt
{'mtime'} > 0) {
1516 if ($var::Opt
{'mtime'} =~ /^\+/) {
1517 if ((int($mtime) < $var::Opt
{'mtime'})) {
1518 util
::vprint
(sprintf(_
("Too new: %s"), $target));
1519 $counts_ref->{'too_new'}++;
1523 if (int($mtime) != $var::Opt
{'mtime'}) {
1524 if (int($mtime) > $var::Opt
{'mtime'}) {
1525 util
::vprint
(sprintf(_
("Too old: %s"),$target));
1526 $counts_ref->{'too_old'}++;
1528 util
::vprint
(sprintf(_
("Too new: %s"),$target));
1529 $counts_ref->{'too_new'}++;
1535 # $var::Opt{'mtime'} == 0 ;
1540 # Extract the file name of the target.
1541 $target =~ m!^.*/([^/]+)$!;
1544 if ($fname =~ m!^($conf::DENY_FILE)$!i ) {
1545 util
::vprint
(sprintf(_
("Denied: %s"), $target));
1546 $counts_ref->{'denied'}++;
1549 if ($fname !~ m!^($conf::ALLOW_FILE)$!i) {
1550 util
::vprint
(sprintf(_
("Not allowed: %s"), $target));
1551 $counts_ref->{'not_allowed'}++;
1554 util
::vprint
(sprintf(_
("Targeted: %s"), $target));
1555 push @
$flists_ref, $target;
1560 sub report_find_target
($$%) {
1561 my ($elapsed, $num_targeted, %counts) = @_;
1563 util
::vprint
(_
("find_target finished: ") . localtime(time()). "\n");
1564 util
::vprint
(sprintf(_
("Target Files: %d (Scan Performance: Elapsed Sec.: %d, Files/sec: %.1f)"),
1565 $num_targeted, $elapsed,
1566 $num_targeted /$elapsed));
1567 util
::vprint
(sprintf(_
(" Possible: %d, Not allowed: %d, Denied: %d, Excluded: %d"),
1568 $counts{'possible'},
1569 $counts{'not_allowed'},
1571 $counts{'excluded'}));
1572 util
::vprint
(sprintf(_
(" MTIME too old: %d, MTIME too new: %d"),
1574 $counts{'too_new'}));
1578 util
::dprint
(_
("lang_msg: ")."$util::LANG_MSG\n");
1579 util
::dprint
(_
("lang: ")."$util::LANG\n");
1581 my $usage = $usage::USAGE
;
1583 printf "$usage", $var::VERSION
, $var::TRAC_URI
, $var::MAILING_ADDRESS
;
1586 sub show_mini_usage
() {
1587 print _
("Usage: mknmz [options] <target>...\n");
1588 print _
("Try `mknmz --help' for more information.\n");
1591 sub show_version
() {
1592 print $usage::VERSION_INFO
;
1596 # check the file. No $msg is good.
1598 sub check_file
($$$$$) {
1599 my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_;
1602 if ($mtype =~ /; x-system=unsupported$/) {
1603 $mtype =~ s/; x-system=unsupported$//;
1604 $msg = _
("Unsupported media type ")."($mtype)"._
(" skipped.");
1605 } elsif ($mtype =~ /; x-error=file_size_max/) {
1606 $msg = _
("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX
. ') < '. $cfile_size ;
1607 } elsif ($mtype =~ /; x-error=.*$/) {
1608 $mtype =~ s/^.*; x-error=(.*)$/$1/;
1610 } elsif ($mtype =~ /^x-system/) {
1611 $msg = _
("system error occurred! ")."($mtype)"._
(" skipped.");
1612 } elsif (! -e
$cfile) {
1613 $msg = _
("does NOT EXIST! skipped.");
1614 } elsif (! util
::canopen
($cfile)) {
1615 $msg = _
("is NOT READABLE! skipped.");
1616 } elsif ($text_size == 0 || $cfile_size == 0) {
1617 $msg = _
("is 0 size! skipped.");
1618 } elsif ($mtype =~ /^application\/octet
-stream
/) {
1619 $msg = _
("may be a BINARY file! skipped.");
1620 } elsif ($cfile_size > $conf::FILE_SIZE_MAX
) {
1621 $msg = _
("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX
. ') < '. $cfile_size ;
1622 } elsif ($text_size > $conf::TEXT_SIZE_MAX
) {
1623 $msg = _
("is larger than your setup after filtered, skipped: ") . 'conf::TEXT_SIZE_MAX (' . $conf::TEXT_SIZE_MAX
. ') < '. $text_size ;
1631 # Write NMZ.version file.
1633 sub write_version
() {
1634 unless (-f
$var::NMZ
{'version'}) {
1635 my $fh = util
::efopen
(">$var::NMZ{'version'}");
1636 print $fh "Namazu-Index-Version: $NAMAZU_INDEX_VERSION\n";
1642 # rename each temporary file to a real file name.
1644 sub write_nmz_files
() {
1645 util
::Rename
($var::NMZ
{'_i'}, $var::NMZ
{'i'});
1646 util
::Rename
($var::NMZ
{'_ii'}, $var::NMZ
{'ii'});
1647 util
::Rename
($var::NMZ
{'_w'}, $var::NMZ
{'w'});
1648 util
::Rename
($var::NMZ
{'_wi'}, $var::NMZ
{'wi'});
1649 util
::Rename
($var::NMZ
{'_p'}, $var::NMZ
{'p'});
1650 util
::Rename
($var::NMZ
{'_pi'}, $var::NMZ
{'pi'});
1654 sub write_body_msg
() {
1655 for my $file (glob "$TEMPLATEDIR/NMZ.body*") {
1656 if ($file =~ m!^.*/NMZ\.body(\.[-\w\.]+)?$!){
1657 my $suffix = $1 ?
$1 : '';
1658 write_message
("$var::NMZ{'body'}${suffix}");
1664 sub write_tips_msg
() {
1665 for my $file (glob "$TEMPLATEDIR/NMZ.tips*") {
1666 if ($file =~ m!^.*/NMZ\.tips(\.[-\w\.]+)?$!){
1667 my $suffix = $1 ?
$1 : '';
1668 write_message
("$var::NMZ{'tips'}${suffix}");
1674 # output NMZ.result.*
1675 sub write_result_file
() {
1676 my $fname = "NMZ.result.normal";
1678 my @files = glob "$TEMPLATEDIR/NMZ.result.*";
1680 for my $file (@files) {
1681 $file =~ m!(NMZ\.result\.[^/]*)$!;
1682 my $target = "$var::OUTPUT_DIR/$1";
1683 if (-f
$target) { # already exist;
1686 my $buf = util
::readfile
($file);
1687 my $fh_file = util
::efopen
(">$target");
1688 print $fh_file $buf;
1689 util
::fclose
($fh_file);
1694 # write NMZ.body and etc.
1695 sub write_message
($) {
1698 if (! -f
$msgfile) {
1699 my ($template, $fname);
1701 $msgfile =~ m!.*/(.*)$!;
1703 $template = "$TEMPLATEDIR/$fname";
1706 my $buf = util
::readfile
($template);
1707 my $fh_output = util
::efopen
(">$msgfile");
1708 print $fh_output $buf;
1709 util
::fclose
($fh_output);
1716 # Make the NMZ.slog file for logging.
1718 sub make_slog_file
() {
1719 if (! -f
$var::NMZ
{'slog'}) {
1720 my $fh = util
::efopen
(">$var::NMZ{'slog'}");
1723 chmod 0666, $var::NMZ
{'slog'};
1726 my $fh_slogfile = util
::efopen
(">>$var::NMZ{'slog'}");
1727 util
::fclose
($fh_slogfile);
1733 # Concatenate $CURRENTDIR to the head of each file.
1735 sub change_filenames
($) {
1736 my $dir = $var::OUTPUT_DIR
;
1738 for my $key (sort keys %var::NMZ
) {
1739 next if $key =~ /^_/; # exclude temporary file
1740 $var::NMZ
{$key} = "$dir/$var::NMZ{$key}";
1744 for my $key (sort keys %var::NMZ
) {
1746 $var::NMZ
{$key} = util
::tmpnam
($var::NMZ
{$key});
1750 if ($var::Opt
{'debug'}) {
1751 for my $key (sort keys %var::NMZ
) {
1752 util
::dprint
("NMZ: $var::NMZ{$key}\n");
1759 # Preparation processing for appending index files.
1761 sub append_index
(@
) {
1765 ($docid_base, @flist) = set_target_files
(@flist);
1767 unless (@flist) { # if @flist is empty
1768 if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
1769 set_lockfile
($var::NMZ
{'lock2'});
1772 make_headfoot_pages
(0, get_total_keys
());
1773 put_log
(0, 0, 0, get_total_keys
());
1774 make_headfoot_pages
(get_status
("files"), get_status
("keys"));
1775 util
::remove_tmpfiles
();
1777 print _
("No files to index.\n");
1782 # conserve files by copying
1783 copy
($var::NMZ
{'i'}, $var::NMZ
{'_i'});
1784 copy
($var::NMZ
{'w'}, $var::NMZ
{'_w'});
1785 copy
($var::NMZ
{'t'}, $var::NMZ
{'_t'})
1786 unless -f
$var::NMZ
{'_t'}; # preupdated ?
1787 copy
($var::NMZ
{'p'}, $var::NMZ
{'_p'});
1788 copy
($var::NMZ
{'pi'}, $var::NMZ
{'_pi'});
1790 return ($docid_base, @flist);
1794 # Set target files to @flist and return with the number of regiested files.
1796 sub set_target_files
() {
1797 my %rdocs; # 'rdocs' means 'registered documents'
1798 my @found_files = @_;
1800 # Load the list of registered documents
1801 $rdocs{'name'} = load_registry
();
1803 # Pick up overlapped documents and do marking
1805 my @overlapped_files;
1806 grep {$_ !~ /^\# / && $mark1{$_}++ } @
{$rdocs{'name'}};
1807 $rdocs{'overlapped'} = {}; # Prepare an anonymous hash.
1808 for my $overlapped (grep { $mark1{$_} } @found_files) {
1809 $rdocs{'overlapped'}{$overlapped} = 1;
1810 push @overlapped_files, $overlapped;
1813 # Pick up not overlapped documents which are files to index.
1814 my @flist = grep { ! $mark1{$_} } @found_files;
1816 if ($var::Opt
{'noupdate'}) {
1817 return (scalar @
{$rdocs{'name'}}, @flist);
1820 # Load the date index.
1821 $rdocs{'mtime'} = load_dateindex
();
1823 if (@
{$rdocs{'mtime'}} == 0) {
1824 return (scalar @
{$rdocs{'name'}}, @flist);
1827 util
::assert
(@
{$rdocs{'name'}} == @
{$rdocs{'mtime'}},
1828 "NMZ.r ($#{$rdocs{'name'}}) and NMZ.t ($#{$rdocs{'mtime'}}) are not consistent!");
1830 # Pick up deleted documents and do marking
1831 # (registered in the NMZ.r but not existent in the filesystem)
1832 my @deleted_documents;
1833 unless ($var::Opt
{'nodelete'}) {
1835 grep { $mark2{$_}++ } @found_files;
1836 for my $deleted (grep { $_ !~ /^\# / && ! $mark2{$_} &&
1837 ! $rdocs{'overlapped'}{$_} }
1840 $rdocs{'deleted'}{$deleted} = 1;
1841 push @deleted_documents, $deleted;
1846 if ($var::Opt
{'checkfilesize'}) {
1847 $rdocs{'size'} = load_sizefield
();
1850 # Pick up updated documents and set the missing number for deleted files.
1851 my @updated_documents = pickup_updated_documents
(\
%rdocs);
1853 # Append updated files to the list of files to index.
1854 if (@updated_documents) {
1855 push @flist, @updated_documents;
1858 # Remove duplicates.
1860 @flist = grep { ! $seen{$_}++ } @flist;
1862 util
::dprint
(_
("\n\n== found files ==\n"), join("\n", @found_files), "\n");
1863 util
::dprint
(_
("\n\n== registered documents ==\n"), join("\n", @
{$rdocs{'name'}}), "\n");
1864 util
::dprint
(_
("\n\n== overlapped documents ==\n"), join("\n", @overlapped_files), "\n");
1865 util
::dprint
(_
("\n\n== deleted documents ==\n"), join("\n", @deleted_documents), "\n");
1866 util
::dprint
(_
("\n\n== updated documents ==\n"), join("\n", @updated_documents), "\n");
1867 util
::dprint
(_
("\n\n== files to index ==\n"), join("\n", @flist), "\n");
1869 # Update NMZ.t with the missing number infomation and
1870 # append updated files and deleted files to NMZ.r with leading '# '
1871 if (@updated_documents || @deleted_documents) {
1872 $DeletedFilesCount = 0;
1873 $UpdatedFilesCount = 0;
1874 $UpdatedFilesCount += @updated_documents;
1875 # $DeletedFilesCount += @updated_documents;
1876 $DeletedFilesCount += @deleted_documents;
1877 preupdate_dateindex
(@
{$rdocs{'mtime'}});
1878 preupdate_registry
(@updated_documents, @deleted_documents);
1881 # Return the number of registered documents and list of files to index.
1882 return (scalar @
{$rdocs{'name'}}, @flist);
1885 sub preupdate_registry
(@
) {
1888 my $fh_registry = util
::efopen
(">$var::NMZ{'_r'}");
1889 @list = grep { s/(.*)/\# $1\n/ } @list;
1890 print $fh_registry @list;
1891 print $fh_registry &_
("## deleted: ") . util
::rfc822time
(time()) . "\n\n";
1892 util
::fclose
($fh_registry);
1895 sub preupdate_dateindex
(@
) {
1898 # Since rewriting the entire file, it is not efficient,
1899 # but simple and reliable. this would be revised in the future.
1900 my $fh_dateindex = util
::efopen
(">$var::NMZ{'_t'}");
1901 # print "\nupdate_dateindex\n", join("\n", @mtimes), "\n\n";
1902 print $fh_dateindex pack("N*", @mtimes);
1903 util
::fclose
($fh_dateindex);
1906 sub update_registry
($) {
1907 my ($docid_count) = @_;
1910 my $fh_registry = util
::efopen
(">>$var::NMZ{'r'}");
1911 my $fh_registry_ = util
::efopen
($var::NMZ
{'_r'});
1912 while (defined(my $line = <$fh_registry_>)) {
1913 print $fh_registry $line;
1915 if ($docid_count > 0) {
1916 print $fh_registry &_
("## indexed: ") . util
::rfc822time
(time()) . "\n\n";
1918 util
::fclose
($fh_registry_) if (defined $fh_registry_);
1919 util
::fclose
($fh_registry);
1921 unlink $var::NMZ
{'_r'};
1924 sub update_dateindex
() {
1925 util
::Rename
($var::NMZ
{'_t'}, $var::NMZ
{'t'});
1928 sub update_field_index
() {
1929 my @list = glob "$var::NMZ{'field'}.*.tmp";
1930 for my $tmp (@list) {
1931 if ($tmp =~ m!((^.*/NMZ\.field\..+?(?:\.i)?)\.tmp$)!) {
1935 my $fh_field = util
::efopen
(">>$fname_out");
1936 my $fh_tmp = util
::efopen
($fname_tmp);
1938 while (defined(my $line = <$fh_tmp>)) {
1939 print $fh_field $line;
1941 util
::fclose
($fh_tmp) if (defined $fh_tmp);
1942 util
::fclose
($fh_field);
1946 util
::cdie
(_
("update_field_index: ")."@list");
1951 sub pickup_updated_documents
(\
%) {
1952 my ($rdocs_ref) = @_;
1953 my @updated_documents = ();
1955 # To avoid duplicated outputs caused by --html-split support.
1958 for my $cfile (@
{$rdocs_ref->{'name'}}) {
1959 if (defined($rdocs_ref->{'deleted'}{$cfile})) {
1960 unless ($printed{$cfile}) {
1961 print "$cfile " . _
("was deleted!\n");
1962 $printed{$cfile} = 1;
1964 $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
1965 } elsif (defined($rdocs_ref->{'overlapped'}{$cfile})) {
1966 my $cfile_mtime = (stat($cfile))[9];
1967 my $rfile_mtime = $rdocs_ref->{'mtime'}[$i];
1968 my ($cfile_size, $rfile_size);
1969 if ($var::Opt
{'checkfilesize'}) {
1970 $cfile_size = (stat($cfile))[7];
1971 $rfile_size = $rdocs_ref->{'size'}[$i];
1974 if ($rfile_mtime != $cfile_mtime ||
1975 ($var::Opt
{'checkfilesize'} && ($cfile_size != $rfile_size))) {
1976 # The file is updated!
1977 unless ($printed{$cfile}) {
1978 print "$cfile " . _
("was updated!\n");
1979 $printed{$cfile} = 1;
1981 push(@updated_documents, $cfile);
1982 $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
1988 return @updated_documents
1991 sub load_dateindex
() {
1992 my $fh_dateindex = util
::efopen
($var::NMZ
{'t'});
1994 my $size = -s
$var::NMZ
{'t'};
1996 read($fh_dateindex, $buf, $size);
1997 my @list = unpack("N*", $buf); # load date index
1998 # print "\nload_dateindex\n", join("\n", @list), "\n\n";
2000 util
::fclose
($fh_dateindex);
2004 sub load_registry
() {
2005 my $fh_registry = util
::efopen
($var::NMZ
{'r'});
2009 my @registered = ();
2011 while (defined(my $line = <$fh_registry>)) {
2013 next if $line =~ /^\s*$/; # an empty line
2014 next if $line =~ /^##/; # a comment
2015 if ($line =~ s/^\#\s+//) { # deleted document
2018 # Remove HTML's anchor generated by --html-split option.
2019 $line =~ s/\t.*$//g;
2020 push @registered, $line;
2024 util
::fclose
($fh_registry) if (defined $fh_registry);
2026 # Exclude deleted documents.
2027 for my $doc (@registered) {
2028 if ($deleted{$doc}) {
2029 push @list, "# $doc";
2039 # get file size information from NMZ.field.size
2040 sub load_sizefield
() {
2041 my $fh_sizefield = util
::efopen
($var::NMZ
{'field'} . '.size');
2042 return [] unless defined $fh_sizefield;
2045 while (defined($line = <$fh_sizefield>)) {
2049 util
::fclose
($fh_sizefield) if (defined $fh_sizefield);
2053 sub get_total_keys
() {
2054 my $keys = get_status
("keys");
2055 $keys =~ s/,//g if (defined $keys);
2056 $keys = 0 unless defined $keys;
2060 sub get_total_files
() {
2061 my $files = get_status
("files");
2062 $files =~ s/,//g if (defined $files);
2063 $files = 0 unless defined $files;
2070 my $fh = util
::fopen
($var::NMZ
{'status'});
2071 return undef unless defined $fh;
2073 while (defined(my $line = <$fh>)) {
2074 if ($line =~ /^$key\s+(.*)$/) {
2075 util
::dprint
("status: $key = $1\n");
2080 util
::fclose
($fh) if (defined $fh);
2084 sub put_total_files
($) {
2087 put_status
("files", $number);
2090 sub put_total_keys
($) {
2093 put_status
("keys", $number);
2096 sub put_status
($$) {
2097 my ($key, $value) = @_;
2099 # remove NMZ.status file if the file has a previous value.
2100 unlink $var::NMZ
{'status'} if defined get_status
($key);
2102 my $fh = util
::efopen
(">> $var::NMZ{'status'}");
2103 print $fh "$key $value\n";
2108 sub put_log
($$$$) {
2109 my ($total_files_size, $start_time, $docid_count, $total_keys_count) = @_;
2111 my $date = localtime;
2112 my $added_files_count = $docid_count;
2113 my $deleted_documents_count = $DeletedFilesCount;
2114 my $updated_documents_count = $UpdatedFilesCount;
2115 my $total_files_count = get_total_files
() + $docid_count
2116 - $DeletedFilesCount - $UpdatedFilesCount;
2117 my $added_keys_count = 0;
2118 $added_keys_count = $total_keys_count - get_total_keys
();
2120 my $processtime = time - $start_time;
2121 $processtime = 0 if $start_time == 0;
2122 $total_files_size = $total_files_size;
2123 $total_keys_count = $total_keys_count;
2127 push @logmsgs, N_
("[Append]");
2129 push @logmsgs, N_
("[Base]");
2131 push @logmsgs, N_
("Date:"), "$date" if $date;
2132 push @logmsgs, N_
("Added Documents:"), util
::commas
("$added_files_count")
2133 if $added_files_count;
2134 push @logmsgs, N_
("Deleted Documents:"),
2135 util
::commas
("$deleted_documents_count") if $deleted_documents_count;
2136 push @logmsgs, N_
("Updated Documents:"),
2137 util
::commas
("$updated_documents_count") if $updated_documents_count;
2138 push @logmsgs, N_
("Size (bytes):"), util
::commas
("$total_files_size")
2139 if $total_files_size;
2140 push @logmsgs, N_
("Total Documents:"), util
::commas
("$total_files_count")
2141 if $total_files_count;
2142 push @logmsgs, N_
("Added Keywords:"), util
::commas
("$added_keys_count")
2143 if $added_keys_count;
2144 push @logmsgs, N_
("Total Keywords:"), util
::commas
("$total_keys_count")
2145 if $total_keys_count;
2146 push @logmsgs, N_
("Wakati:"), "$conf::WAKATI" if $conf::WAKATI
;
2147 push @logmsgs, N_
("Time (sec):"), util
::commas
("$processtime")
2149 push @logmsgs, N_
("File/Sec:"), sprintf "%.2f",
2150 (($added_files_count + $updated_documents_count) / $processtime)
2152 push @logmsgs, N_
("System:"), "$English::OSNAME" if $English::OSNAME
;
2153 push @logmsgs, N_
("Perl:"), sprintf("%f", $English::PERL_VERSION
);
2154 push @logmsgs, N_
("Namazu:"), "$var::VERSION" if $var::VERSION
;
2156 my $log_for_file = "";
2158 my $msg = shift @logmsgs; # [Base] or [Append]
2159 # To stdout, use gettext.
2160 print _
($msg), "\n";
2161 # To log file, do not use gettext.
2162 $log_for_file = $msg . "\n";
2164 my $field = shift @logmsgs;
2165 my $value = shift @logmsgs;
2166 printf "%-20s %s\n", _
($field), "$value";
2167 $log_for_file .= sprintf "%-20s %s\n", $field, "$value";
2170 $log_for_file .= "\n";
2172 put_log_to_logfile
($log_for_file);
2173 put_total_files
($total_files_count);
2174 put_total_keys
($total_keys_count);
2176 my $argv = join "\t", @ARGV;
2178 put_status
("argv", $argv);
2179 put_status
("cwd", $cwd);
2182 sub put_log_to_logfile
($) {
2184 my $fh_logfile = util
::efopen
(">>$var::NMZ{'log'}");
2185 print $fh_logfile $logmsg;
2186 util
::fclose
($fh_logfile);
2190 my $year = (localtime)[5] + 1900;
2195 # Compose NMZ.head and NMZ.foot. Prepare samples if necessary.
2196 # Insert $docid_count, $key_count, and $month/$day/$year respectively.
2197 sub make_headfoot
($$$) {
2198 my ($file, $docid_count, $key_count) = @_;
2200 my $day = sprintf("%02d", (localtime)[3]);
2201 my $month = sprintf("%02d", (localtime)[4] + 1);
2202 my $year = get_year
();
2206 $buf = util
::readfile
($file);
2208 $file =~ m!.*/(.*)$!;
2210 my $template = "$TEMPLATEDIR/$fname";
2213 $buf = util
::readfile
($template);
2219 my $fh_file = util
::efopen
(">$file");
2221 if ($buf =~ /(<!-- FILE -->)\s*(.*)\s*(<!-- FILE -->)/) {
2222 my $total_files_count = util
::commas
(get_total_files
() + $docid_count
2223 - $DeletedFilesCount - $UpdatedFilesCount);
2224 $buf =~ s/(<!-- FILE -->)(.*)(<!-- FILE -->)/$1 $total_files_count $3/;
2227 if ($buf =~ /(<!-- KEY -->)\s*(.*)\s*(<!-- KEY -->)/) {
2231 $tmp = util
::commas
($tmp);
2232 $buf =~ s/(<!-- KEY -->)(.*)(<!-- KEY -->)/$1 $tmp $3/;
2234 my $index_dir = basename
($var::OUTPUT_DIR
);
2235 $buf =~ s
#<!-- INDEX_DIR -->#$index_dir#gs;
2236 $buf =~ s
#(<!-- DATE -->)(.*)(<!-- DATE -->)#$1 $year-$month-$day $3#gs;
2237 $buf =~ s/(<!-- VERSION -->)(.*)(<!-- VERSION -->)/$1 v$var::VERSION $3/gs;
2238 $buf =~ s
{(<!-- ADDRESS
-->)(.*)(<!-- ADDRESS
-->)}
2239 {$1\n<a href
="mailto:$conf::ADDRESS">$conf::ADDRESS
</a
>\n$3}gs
;
2240 $buf =~ s
{(<!-- LINK
-REV
-MADE
-->)(.*)(<!-- LINK
-REV
-MADE
-->)}
2241 {$1\n<link rev
="made" href
="mailto:$conf::ADDRESS">\n$3}gs
;
2243 print $fh_file $buf;
2244 util
::fclose
($fh_file);
2247 # Make phrase hashes for NMZ.p
2248 # Handle two words each for calculating a hash value ranged 0-65535.
2249 sub make_phrase_hash
($$$) {
2250 my ($docid_count, $docid_base, $contref) = @_;
2253 $$contref =~ s!\x7f */? *\d+ *\x7f!!g; # remove tags of weight
2254 $$contref =~ tr/\xa1-\xfea-z0-9 \n//cd; # remove all symbols
2255 my @words = split(/\s+/, $$contref);
2256 @words = grep {$_ ne ""} @words; # remove empty words
2257 my $word_b = shift @words;
2258 my $docid = $docid_count + $docid_base;
2259 for my $word (@words) {
2260 next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX
);
2261 my $hash = hash
($word_b . $word);
2262 unless (defined $tmp{$hash}) {
2264 $PhraseHashLast{$hash} = 0 unless defined $PhraseHashLast{$hash};
2265 $PhraseHash{$hash} .= pack("w", $docid - $PhraseHashLast{$hash});
2266 # util::dprint("<$word_b, $word> $hash\n");
2267 $PhraseHashLast{$hash} = $docid;
2273 # Construct NMZ.p and NMZ.pi file. this processing is rather complex.
2274 sub write_phrase_hash
() {
2275 write_phrase_hash_sub
();
2276 util
::Rename
($var::NMZ
{'__p'}, $var::NMZ
{'_p'});
2277 util
::Rename
($var::NMZ
{'__pi'}, $var::NMZ
{'_pi'});
2280 sub write_phrase_hash_sub
() {
2283 return 0 if %PhraseHash eq ''; # namazu-devel-ja #3146
2284 util
::dprint
(_
("doing write_phrase_hash() processing.\n"));
2286 my $fh_tmp_pi = util
::efopen
(">$var::NMZ{'__pi'}");
2287 my $fh_tmp_p = util
::efopen
(">$var::NMZ{'__p'}");
2289 my $fh_phrase = util
::fopen
($var::NMZ
{'_p'});
2290 my $fh_phraseindex = undef;
2292 $fh_phraseindex = util
::efopen
($var::NMZ
{'_pi'});
2297 for (my $i = 0; $i < 65536; $i++) {
2299 my $baserecord = "";
2304 read($fh_phraseindex, $tmp, $var::INTSIZE
);
2305 $tmp = unpack("N", $tmp);
2306 if ($tmp != 0xffffffff) { # 0xffffffff
2307 $baseleng = readw
($fh_phrase);
2308 read($fh_phrase, $baserecord, $baseleng);
2311 if (defined($PhraseHash{$i})) {
2312 if ($baserecord eq "") {
2313 print $fh_tmp_pi pack("N", $ptr);
2314 my $record = $PhraseHash{$i};
2315 my $n2 = length($record);
2316 my $data = pack("w", $n2) . $record;
2317 print $fh_tmp_p $data;
2318 $ptr += length($data);
2320 print $fh_tmp_pi pack("N", $ptr);
2321 my $record = $PhraseHash{$i};
2322 my $last_docid = get_last_docid
($baserecord, 1);
2323 my $adjrecord = adjust_first_docid
($record, $last_docid);
2324 check_records
(\
$record, \
$baserecord, 1) unless defined $adjrecord; # namazu-bugs-ja#31
2325 $record = $adjrecord;
2326 my $n2 = length($record) + $baseleng;
2327 my $data = pack("w", $n2) . $baserecord . $record;
2328 print $fh_tmp_p $data;
2329 $ptr += length($data);
2332 if ($baserecord eq "") {
2333 # if $baserecord has no data, set to 0xffffffff
2334 print $fh_tmp_pi pack("N", 0xffffffff);
2336 print $fh_tmp_pi pack("N", $ptr);
2337 my $data = pack("w", $baseleng) . $baserecord;
2338 print $fh_tmp_p $data;
2339 $ptr += length($data);
2345 util
::fclose
($fh_phraseindex);
2347 if (defined $fh_phrase) {
2348 util
::fclose
($fh_phrase);
2350 util
::fclose
($fh_tmp_p);
2351 util
::fclose
($fh_tmp_pi);
2354 %PhraseHashLast = ();
2357 # Dr. Knuth's ``hash'' from (UNIX MAGAZINE May 1998)
2362 for (my $i = 0; $word ne ""; $i++) {
2363 $hash ^= $Seed[$i & 0x03][ord($word)];
2364 $word = substr $word, 1;
2365 # $word =~ s/^.//; is slower
2367 return $hash & 65535;
2370 # Count frequencies of words.
2371 sub count_words
($$$$) {
2372 my ($docid_count, $docid_base, $contref, $kanji) = @_;
2375 # Normalize into small letter.
2376 $$contref =~ tr/A-Z/a-z/;
2378 # Remove control char.
2379 $$contref =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a/ /;
2381 # It corresponds to -j option of ChaSen.
2382 $$contref =~ s/^[ \t\f]+//gm; # except "\r\n"
2383 $$contref =~ s/[ \t\f]+$//gm; # except "\r\n"
2384 $$contref =~ s/([a-z])-\n([a-z])/$1$2/gsi; # for hyphenation
2385 if (util
::islang
("ja")) {
2386 $$contref =~ s/([\x80-\xff])\n([\x80-\xff])/$1$2/gs;
2387 $$contref =~ s/(¡£|¡¢)/$1\n/gs;
2389 $$contref =~ s/\n+/\n/gs;
2391 # Do wakatigaki if necessary.
2392 if (util
::islang
("ja")) {
2393 wakati
::wakatize_japanese
($contref) if $kanji;
2398 if ($$contref =~ /\x7f/) {
2399 $part1 = substr $$contref, 0, index($$contref, "\x7f");
2400 $part2 = substr $$contref, index($$contref, "\x7f");
2401 # $part1 = $PREMATCH; # $& and friends are not efficient
2402 # $part2 = $MATCH . $POSTMATCH;
2409 my %word_count = ();
2410 $part2 =~ s
!\x7f *(\d
+) *\x7f([^\x7f]*)\x7f */ *\d
+ *\x7f!
2411 wordcount_sub
($2, $1, \
%word_count)!ge;
2412 wordcount_sub
($part1, 1, \
%word_count);
2414 # Add them to whole index
2415 my $docid = $docid_count + $docid_base;
2416 for my $word (keys(%word_count)) {
2417 next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX
);
2418 $KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
2419 $KeyIndex{$word} .= pack("w2",
2420 $docid - $KeyIndexLast{$word},
2421 $word_count{$word});
2422 $KeyIndexLast{$word} = $docid;
2427 # Count words and do score weighting
2429 sub wordcount_sub
($$\
%) {
2430 my ($text, $weight, $word_count) = @_;
2432 # Remove all symbols when -K option is specified.
2433 $text =~ tr/\xa1-\xfea-z0-9/ /c if $var::Opt
{'nosymbol'};
2435 # Count frequencies of words in a current document.
2436 # Handle symbols as follows.
2438 # tcp/ip -> tcp/ip, tcp, ip
2439 # (tcp/ip) -> (tcp/ip), tcp/ip, tcp, ip
2440 # ((tcpi/ip)) -> ((tcp/ip)), (tcp/ip), tcp
2442 # Don't do processing for nested symbols.
2443 # NOTE: When -K is specified, all symbols are already removed.
2445 my @words = split /\s+/, $text;
2446 for my $word (@words) {
2447 next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX
);
2448 if ($var::Opt
{'noedgesymbol'}) {
2449 # remove symbols at both ends
2450 $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
2452 $word_count->{$word} = 0 unless defined($word_count->{$word});
2453 $word_count->{$word} += $weight;
2454 unless ($var::Opt
{'nosymbol'}) {
2455 if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
2456 $word_count->{$1} = 0 unless defined($word_count->{$1});
2457 $word_count->{$1} += $weight;
2458 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2459 } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
2460 $word_count->{$1} = 0 unless defined($word_count->{$1});
2461 $word_count->{$1} += $weight;
2462 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2463 } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
2464 $word_count->{$1} = 0 unless defined($word_count->{$1});
2465 $word_count->{$1} += $weight;
2466 next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2468 my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
2469 if $word =~ /[^\xa1-\xfea-z_0-9]/;
2470 for my $tmp (@words_) {
2472 $word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
2473 $word_count->{$tmp} += $weight;
2481 # Construct NMZ.i and NMZ.ii file. this processing is rather complex.
2482 sub write_index
() {
2483 my $key_count = write_index_sub
();
2484 util
::Rename
($var::NMZ
{'__i'}, $var::NMZ
{'_i'});
2485 util
::Rename
($var::NMZ
{'__w'}, $var::NMZ
{'_w'});
2490 # readw: read one pack 'w' word.
2491 # This code was contributed by <furukawa@tcp-ip.or.jp>.
2497 while (read($fh, $c, 1)){
2499 last unless 0x80 & ord $c;
2501 return unpack('w', $ret);
2504 sub get_last_docid
($$) {
2505 my ($record, $step) = @_;
2506 my (@data) = unpack 'w*', $record;
2509 for (my $i = 0; $i < @data; $i += $step) {
2512 my $leng = @data / $step;
2516 sub adjust_first_docid
($$) {
2517 my ($record, $last_docid) = @_;
2518 my (@data) = unpack 'w*', $record;
2520 $data[0] = $data[0] - $last_docid;
2521 return undef if ($data[0] < 0); # namazu-bug-ja#31
2522 $record = pack 'w*', @data;
2526 sub write_index_sub
() {
2527 my @words = sort keys(%KeyIndex);
2528 return 0 if $#words == -1;
2534 my $baserecord = "";
2536 util
::dprint
(_
("doing write_index() processing.\n"));
2537 my $fh_tmp_i = util
::efopen
(">$var::NMZ{'__i'}");
2538 my $fh_tmp_w = util
::efopen
(">$var::NMZ{'__w'}");
2539 my $fh_i = util
::fopen
($var::NMZ
{'_i'});
2540 my $fh_ii = util
::efopen
(">$var::NMZ{'_ii'}");
2541 my $fh_w = util
::fopen
($var::NMZ
{'_w'});
2542 my $fh_wi = util
::efopen
(">$var::NMZ{'_wi'}");
2546 while (defined(my $line = <$fh_w>)) {
2548 my $current_word = $line;
2550 my $baseleng = readw
($fh_i);
2551 read($fh_i, $baserecord, $baseleng);
2553 for (; $cnt < @words; $cnt++) {
2554 last unless $words[$cnt] le $current_word;
2555 my $record = $KeyIndex{$words[$cnt]};
2556 my $leng = length($record);
2558 if ($current_word eq $words[$cnt]) {
2559 my $last_docid = get_last_docid
($baserecord, 2);
2560 my $adjrecord = adjust_first_docid
($record, $last_docid);
2561 check_records
(\
$record, \
$baserecord, 2) unless defined $adjrecord; # namazu-bugs-ja#31
2562 $record = $adjrecord;
2563 $leng = length($record); # re-measure
2564 my $tmp = pack("w", $leng + $baseleng);
2566 my $data_i = "$tmp$baserecord$record";
2567 my $data_w = "$current_word\n";
2568 print $fh_tmp_i $data_i;
2569 print $fh_tmp_w $data_w;
2570 print $fh_ii pack("N", $ptr_i);
2571 print $fh_wi pack("N", $ptr_w);
2572 $ptr_i += length($data_i);
2573 $ptr_w += length($data_w);
2579 my $tmp = pack("w", $leng);
2580 my $data_i = "$tmp$record";
2581 my $data_w = "$words[$cnt]\n";
2582 print $fh_tmp_i $data_i;
2583 print $fh_tmp_w $data_w;
2584 print $fh_ii pack("N", $ptr_i);
2585 print $fh_wi pack("N", $ptr_w);
2586 $ptr_i += length($data_i);
2587 $ptr_w += length($data_w);
2591 my $tmp = pack("w", $baseleng);
2592 my $data_i = "$tmp$baserecord";
2593 my $data_w = "$current_word\n";
2594 print $fh_tmp_i $data_i;
2595 print $fh_tmp_w $data_w;
2596 print $fh_ii pack("N", $ptr_i);
2597 print $fh_wi pack("N", $ptr_w);
2598 $ptr_i += length($data_i);
2599 $ptr_w += length($data_w);
2603 while ($cnt < @words) {
2604 my $leng = length($KeyIndex{$words[$cnt]});
2605 my $tmp = pack("w", $leng);
2606 my $record = $KeyIndex{$words[$cnt]};
2608 my $data_i = "$tmp$record";
2609 my $data_w = "$words[$cnt]\n";
2610 print $fh_tmp_i $data_i;
2611 print $fh_tmp_w $data_w;
2612 print $fh_ii pack("N", $ptr_i);
2613 print $fh_wi pack("N", $ptr_w);
2614 $ptr_i += length($data_i);
2615 $ptr_w += length($data_w);
2622 util
::fclose
($fh_wi);
2623 util
::fclose
($fh_w) if (defined $fh_w);
2624 util
::fclose
($fh_ii);
2625 util
::fclose
($fh_i) if (defined $fh_i);
2626 util
::fclose
($fh_tmp_w);
2627 util
::fclose
($fh_tmp_i);
2633 # Decide the media type.
2634 # FIXME: Very ad hoc. It's just a compromise. -- satoru
2636 sub decide_type
($$) {
2637 my ($name, $cont) = @_;
2638 return $name if (!defined $cont || $name eq $cont);
2640 util
::dprint
("decide_type: name: $name, cont: $cont\n");
2641 if ($cont =~ m!^text/plain! && $name =~ m!^text/plain!) {
2643 } elsif ($cont =~ m!^application/octet-stream! &&
2644 $name !~ m!^text/!) {
2646 } elsif ($cont =~ m!^application/(excel|powerpoint|msword)! &&
2647 $name !~ m!^application/octet-stream!) {
2648 # FIXME: Currently File::MMagic 1.02's checktype_data()
2649 # is unreliable for them.
2651 } elsif ($cont =~ m!^application/x-zip! &&
2652 $name =~ m!^application/!) {
2653 # zip format is used other applications e.g. OpenOffice.
2654 # It is necessary to add to check extention.
2662 # Debugging code for the "negative numbers" problem.
2664 sub check_records
($$$) {
2665 my ($recref, $baserecref, $step) = @_;
2666 dump_record
($baserecref, $step);
2667 dump_record
($recref, $step);
2668 print STDERR
"The \x22negative number\x22 problem occurred.\n";
2672 sub dump_record
($$) {
2673 my ($recref, $step) = @_;
2674 my (@data) = unpack 'w*', $$recref;
2675 print STDERR
"dump record data to NMZ.bug.info (step: $step)...";
2676 my $fh_info = util
::fopen
(">> NMZ.bug.info");
2677 print $fh_info "dumped record data (step: $step)...";
2679 print $fh_info sprintf(" %08x", $_);
2681 print $fh_info "\n";
2682 util
::fclose
($fh_info);
2688 print STDERR
"Warning: signal $signame occured.\n";
2692 # For avoiding "used only once: possible typo at ..." warnings.
2694 muda
($conf::ON_MEMORY_MAX
,
2695 $conf::WORD_LENG_MAX
, $conf::TEXT_SIZE_MAX
,
2696 $conf::DENY_FILE
, $var::INTSIZE
,
2697 $conf::CHASEN_NOUN
, $conf::CHASEN
,
2698 $conf::KAKASI
, $var::Opt
{'okurigana'},
2699 $var::Opt
{'hiragana'}, $conf::DIRECTORY_INDEX
,
2700 $usage::USAGE
, $var::Opt
{'noheadabst'}, $usage::VERSION_INFO
,
2701 $var::Opt
{'noencodeurl'}, $conf::HTML_SUFFIX
,
2702 $var::RECURSIVE_ACTIONS
, $conf::META_TAGS
, $var::USE_NKF_MODULE
,
2703 $conf::ADDRESS
, $var::MAILING_ADDRESS
,
2704 $conf::FILE_SIZE_MAX
,