| 1 | #! /usr/bin/perl -w |
| 2 | # -*- Perl -*- |
| 3 | # mknmz - indexer of Namazu |
| 4 | # $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $ |
| 5 | # |
| 6 | # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved. |
| 7 | # Copyright (C) 2000-2008 Namazu Project All rights reserved. |
| 8 | # This is free software with ABSOLUTELY NO WARRANTY. |
| 9 | # |
| 10 | # This program is free software; you can redistribute it and/or modify |
| 11 | # it under the terms of the GNU General Public License as published by |
| 12 | # the Free Software Foundation; either versions 2, or (at your option) |
| 13 | # any later version. |
| 14 | # |
| 15 | # This program is distributed in the hope that it will be useful |
| 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 18 | # GNU General Public License for more details. |
| 19 | # |
| 20 | # You should have received a copy of the GNU General Public License |
| 21 | # along with this program; if not, write to the Free Software |
| 22 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
| 23 | # 02111-1307, USA |
| 24 | # |
| 25 | # This file must be encoded in EUC-JP encoding |
| 26 | # |
| 27 | |
| 28 | package mknmz; |
| 29 | require 5.004; |
| 30 | use English; |
| 31 | use lib "."; |
| 32 | use Cwd; |
| 33 | use IO::File; |
| 34 | use File::Find; |
| 35 | use File::MMagic; |
| 36 | use Time::Local; |
| 37 | use strict; # be strict since v1.2.0 |
| 38 | use Getopt::Long; |
| 39 | use File::Copy; |
| 40 | use DirHandle; |
| 41 | use File::Basename; |
| 42 | |
| 43 | use vars qw($SYSTEM); |
| 44 | # It exists only for back compatibility. |
| 45 | $SYSTEM = $English::OSNAME; |
| 46 | |
| 47 | my $NAMAZU_INDEX_VERSION = "2.0"; |
| 48 | |
| 49 | my $CodingSystem = "euc"; |
| 50 | my $PKGDATADIR = $ENV{'pkgdatadir'} || "/usr/share/namazu"; |
| 51 | my $CONFDIR = "/etc/namazu"; # directory where mknmzrc are in. |
| 52 | my $LIBDIR = $PKGDATADIR . "/pl"; # directory where library etc. are in. |
| 53 | my $FILTERDIR = $PKGDATADIR . "/filter"; # directory where filters are in. |
| 54 | my $TEMPLATEDIR = $PKGDATADIR . "/template"; # directory where templates are in. |
| 55 | |
| 56 | my $DeletedFilesCount = 0; |
| 57 | my $UpdatedFilesCount = 0; |
| 58 | my $APPENDMODE = 0; |
| 59 | my %PhraseHash = (); |
| 60 | my %PhraseHashLast = (); |
| 61 | my %KeyIndex = (); |
| 62 | my %KeyIndexLast = (); |
| 63 | my %CheckPoint = ("on" => undef, "continue" => undef); |
| 64 | my $ConfigFile = undef; |
| 65 | my $MediaType = undef; |
| 66 | |
| 67 | my $ReplaceCode = undef; # perl code for transforming URI |
| 68 | my @Seed = (); |
| 69 | my @LoadedRcfiles = (); |
| 70 | my $Magic = new File::MMagic; |
| 71 | |
| 72 | my $ReceiveTERM = 0; |
| 73 | |
| 74 | STDOUT->autoflush(1); |
| 75 | STDERR->autoflush(1); |
| 76 | main(); |
| 77 | sub main { |
| 78 | my $start_time = time; |
| 79 | |
| 80 | if ($English::PERL_VERSION == 5.008001) { |
| 81 | unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) { |
| 82 | print "Run mknmz with the environment variable PERL_HASH_SEED=0\n"; |
| 83 | exit 1; |
| 84 | } |
| 85 | } |
| 86 | |
| 87 | init(); |
| 88 | |
| 89 | # At first, loading pl/conf.pl to prevent overriding some variables. |
| 90 | preload_modules(); |
| 91 | |
| 92 | # set LANG and bind textdomain |
| 93 | util::set_lang(); |
| 94 | textdomain('namazu', $util::LANG_MSG); |
| 95 | |
| 96 | load_modules(); |
| 97 | my ($output_dir, @targets) = parse_options(); |
| 98 | my ($docid_base, $total_files_num) = prep($output_dir, @targets); |
| 99 | |
| 100 | my $swap = 1; |
| 101 | my $docid_count = 0; |
| 102 | my $file_count = 0; |
| 103 | my $total_files_size = 0; |
| 104 | my $key_count = 0; |
| 105 | my $checkpoint = 0; |
| 106 | my $flist_ptr = 0; |
| 107 | my $processed_files_size = 0; |
| 108 | |
| 109 | if ($CheckPoint{'continue'}) { |
| 110 | # Restore variables |
| 111 | eval util::readfile($var::NMZ{'_checkpoint'}) ; |
| 112 | } else { |
| 113 | print $total_files_num . _(" files are found to be indexed.\n"); |
| 114 | } |
| 115 | |
| 116 | { |
| 117 | my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}"); |
| 118 | my $fh_flist = util::efopen($var::NMZ{'_flist'}); |
| 119 | my %field_indices = (); |
| 120 | get_field_index_base(\%field_indices); |
| 121 | |
| 122 | if ($CheckPoint{'continue'}) { |
| 123 | seek($fh_flist, $flist_ptr, 0); |
| 124 | } |
| 125 | |
| 126 | # Process target files one by one |
| 127 | while (defined(my $line = <$fh_flist>)) { |
| 128 | $flist_ptr += length($line); |
| 129 | my $cfile = $line; |
| 130 | chomp $cfile; |
| 131 | util::dprint(_("target file: ")."$cfile\n"); |
| 132 | |
| 133 | my ($cfile_size, $num) = |
| 134 | process_file($cfile, $docid_count, $docid_base, |
| 135 | $file_count, \%field_indices, |
| 136 | $fh_errorsfile, $total_files_num); |
| 137 | if ($num == 0) { |
| 138 | $total_files_num--; |
| 139 | next; |
| 140 | } else { |
| 141 | $docid_count += $num; |
| 142 | $file_count++; |
| 143 | } |
| 144 | |
| 145 | $total_files_size += $cfile_size; |
| 146 | $processed_files_size += $cfile_size; |
| 147 | last if $ReceiveTERM; |
| 148 | if ($processed_files_size > $conf::ON_MEMORY_MAX) { |
| 149 | if (%KeyIndex) { |
| 150 | $key_count = write_index(); |
| 151 | print _("Writing index files..."); |
| 152 | write_phrase_hash(); |
| 153 | print "\n"; |
| 154 | } |
| 155 | $processed_files_size = 0; |
| 156 | $checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>); |
| 157 | } |
| 158 | } |
| 159 | |
| 160 | util::fclose($fh_flist); |
| 161 | util::fclose($fh_errorsfile); |
| 162 | } |
| 163 | # This should be out of above blocks because of file handler closing. |
| 164 | re_exec($flist_ptr, $docid_count, $docid_base, $start_time, |
| 165 | $total_files_size, $total_files_num, |
| 166 | $file_count, $key_count) if $checkpoint; |
| 167 | |
| 168 | if (%KeyIndex) { |
| 169 | $key_count = write_index(); |
| 170 | print _("Writing index files..."); |
| 171 | write_phrase_hash(); |
| 172 | print "\n"; |
| 173 | } |
| 174 | |
| 175 | $key_count = get_total_keys() unless $key_count; |
| 176 | do_remain_job($total_files_size, $docid_count, $key_count, |
| 177 | $start_time); |
| 178 | exit 0; |
| 179 | } |
| 180 | |
| 181 | # |
| 182 | # FIXME: Very complicated. |
| 183 | # |
| 184 | sub process_file ($$$$\%$$) { |
| 185 | my ($cfile, $docid_count, $docid_base, $file_count, |
| 186 | $field_indices, $fh_errorsfile, $total_files_num) = @_; |
| 187 | |
| 188 | my $processed_num = 0; |
| 189 | my $file_size = util::filesize($cfile); |
| 190 | |
| 191 | if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) { |
| 192 | my @parts; |
| 193 | @parts = htmlsplit::split($cfile, "NMZ.partial") |
| 194 | if ($file_size <= $conf::FILE_SIZE_MAX); |
| 195 | if (@parts > 1) { |
| 196 | my $id = 0; |
| 197 | for my $part (@parts) { |
| 198 | next if (defined $conf::EXCLUDE_PATH && |
| 199 | "$cfile#$part" =~ /$conf::EXCLUDE_PATH/); |
| 200 | my $fname = util::tmpnam("NMZ.partial.$id"); |
| 201 | my $fragment = defined $part ? $part : undef; |
| 202 | my $uri = generate_uri($cfile, $fragment); |
| 203 | my $result = namazu_core($fname, |
| 204 | $docid_count + $processed_num, |
| 205 | $docid_base, $file_count, |
| 206 | $field_indices, $fh_errorsfile, |
| 207 | $total_files_num, |
| 208 | $uri, $id, $#parts); |
| 209 | if ($result > 0) { |
| 210 | $processed_num++; |
| 211 | my $rname = defined $part ? "$cfile\t$part" : "$cfile"; |
| 212 | put_registry($rname); |
| 213 | } |
| 214 | unlink $fname; |
| 215 | $id++; |
| 216 | } |
| 217 | return ($file_size, $processed_num); |
| 218 | } |
| 219 | } |
| 220 | my $result = namazu_core($cfile, $docid_count, $docid_base, |
| 221 | $file_count, $field_indices, |
| 222 | $fh_errorsfile, $total_files_num, |
| 223 | undef, undef, undef); |
| 224 | if ($result > 0) { |
| 225 | $processed_num++; |
| 226 | put_registry($cfile); |
| 227 | } |
| 228 | |
| 229 | return ($file_size, $processed_num); |
| 230 | } |
| 231 | |
| 232 | # |
| 233 | # Load mknmzrcs: |
| 234 | # |
| 235 | # 1. MKNMZRC environment |
| 236 | # |
| 237 | # 2. $(sysconfdir)/$(PACKAGE)/mknmzrc |
| 238 | # |
| 239 | # 3. ~/.mknmzrc |
| 240 | # |
| 241 | # 4. user-specified mknmzrc set by mknmz --config=file option. |
| 242 | # |
| 243 | # If multiple files exists, read all of them. |
| 244 | # |
| 245 | sub load_rcfiles () { |
| 246 | my (@cand) = (); |
| 247 | |
| 248 | # To support Windows. Since they have nasty drive letter convention, |
| 249 | # it is necessary to change mknmzrc dynamically with env. variable. |
| 250 | push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'}; |
| 251 | push @cand, "$CONFDIR/mknmzrc"; |
| 252 | push @cand, "$ENV{'HOME'}/.mknmzrc"; |
| 253 | |
| 254 | util::vprint(_("Reading rcfile: ")); |
| 255 | for my $rcfile (@cand) { |
| 256 | if (-f $rcfile) { |
| 257 | load_rcfile ($rcfile); |
| 258 | util::vprint(" $rcfile"); |
| 259 | } |
| 260 | } |
| 261 | util::vprint("\n"); |
| 262 | } |
| 263 | |
| 264 | sub load_rcfile ($) { |
| 265 | my ($rcfile) = @_; |
| 266 | if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") { |
| 267 | util::win32_yen_to_slash(\$rcfile); |
| 268 | } |
| 269 | return if (grep {m/^$rcfile$/} @LoadedRcfiles); |
| 270 | do $rcfile; |
| 271 | if ($@) { |
| 272 | chop $@; |
| 273 | push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'"; |
| 274 | }else { |
| 275 | push @LoadedRcfiles, $rcfile; |
| 276 | } |
| 277 | |
| 278 | # Dirty workaround. |
| 279 | $LIBDIR = $conf::LIBDIR |
| 280 | if (defined $conf::LIBDIR && -d $conf::LIBDIR); |
| 281 | $FILTERDIR = $conf::FILTERDIR |
| 282 | if (defined $conf::FILTERDIR && -d $conf::FILTERDIR); |
| 283 | $TEMPLATEDIR = $conf::TEMPLATEDIR |
| 284 | if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR); |
| 285 | } |
| 286 | |
| 287 | sub re_exec($$$$$$$$) { |
| 288 | my ($flist_ptr, $docid_count, $docid_base, $start_time, |
| 289 | $total_files_size, $total_files_num, $file_count, $key_count) = @_; |
| 290 | |
| 291 | # store variables |
| 292 | { |
| 293 | my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}"); |
| 294 | |
| 295 | print $fh_checkpoint <<EOM; |
| 296 | \$DeletedFilesCount = $DeletedFilesCount; |
| 297 | \$UpdatedFilesCount = $UpdatedFilesCount; |
| 298 | \$APPENDMODE = $APPENDMODE; |
| 299 | \$flist_ptr = $flist_ptr; |
| 300 | \$docid_count = $docid_count; |
| 301 | \$docid_base = $docid_base; |
| 302 | \$start_time = $start_time; |
| 303 | \$total_files_size = $total_files_size; |
| 304 | \$total_files_num = $total_files_num; |
| 305 | \$key_count = $key_count; |
| 306 | \$file_count = $file_count; |
| 307 | \$\$ = $$; |
| 308 | EOM |
| 309 | util::fclose($fh_checkpoint); |
| 310 | } |
| 311 | |
| 312 | @ARGV = ("-S", @ARGV) ; |
| 313 | print _("Checkpoint reached: re-exec mknmz...\n"); |
| 314 | util::dprint(join ' ', ("::::", @ARGV, "\n")); |
| 315 | exec ($0, @ARGV) ; |
| 316 | } |
| 317 | |
| 318 | sub put_registry ($) { |
| 319 | my ($filename) = @_; |
| 320 | my $fh_registry = util::efopen(">>$var::NMZ{'_r'}"); |
| 321 | print $fh_registry $filename, "\n"; |
| 322 | util::fclose($fh_registry); |
| 323 | } |
| 324 | |
| 325 | |
| 326 | # Initialization |
| 327 | # $CodingSystem: Character Coding System 'euc' or 'sjis' |
| 328 | sub init () { |
| 329 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { |
| 330 | $CodingSystem = "sjis"; |
| 331 | if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { |
| 332 | $CONFDIR = $1 . $CONFDIR ; |
| 333 | } |
| 334 | if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { |
| 335 | $LIBDIR = $1 . $LIBDIR ; |
| 336 | } |
| 337 | if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { |
| 338 | $FILTERDIR = $1 . $FILTERDIR ; |
| 339 | } |
| 340 | if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) { |
| 341 | $TEMPLATEDIR = $1 . $TEMPLATEDIR ; |
| 342 | } |
| 343 | } else { |
| 344 | $CodingSystem = "euc"; |
| 345 | } |
| 346 | |
| 347 | $SIG{'INT'} = sub { |
| 348 | util::cdie("SIGINT caught! Aborted.\n"); |
| 349 | }; |
| 350 | |
| 351 | $SIG{'TERM'} = sub { |
| 352 | print STDERR "SIGTERM caught!\n"; |
| 353 | $ReceiveTERM = 1; |
| 354 | }; |
| 355 | } |
| 356 | |
| 357 | sub preload_modules () { |
| 358 | unshift @INC, $LIBDIR; |
| 359 | # workaround for test suites. |
| 360 | unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'}; |
| 361 | |
| 362 | require "var.pl" || die "unable to require \"var.pl\"\n"; |
| 363 | require "conf.pl" || die "unable to require \"conf.pl\"\n"; |
| 364 | require "util.pl" || die "unable to require \"util.pl\"\n"; |
| 365 | require "gettext.pl" || die "unable to require \"gettext.pl\"\n"; |
| 366 | require "ext.pl" || die "unable to require \"ext.pl\"\n"; |
| 367 | } |
| 368 | |
| 369 | sub postload_modules () { |
| 370 | require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n"; |
| 371 | } |
| 372 | |
| 373 | sub load_modules () { |
| 374 | require "usage.pl" || die "unable to require \"usage.pl\"\n"; |
| 375 | require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n"; |
| 376 | require "wakati.pl" || die "unable to require \"wakati.pl\"\n"; |
| 377 | require "seed.pl" || die "unable to require \"seed.pl\"\n"; |
| 378 | require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n"; |
| 379 | |
| 380 | @Seed = seed::init(); |
| 381 | } |
| 382 | |
| 383 | sub load_filtermodules () { |
| 384 | unshift @INC, $FILTERDIR; |
| 385 | |
| 386 | # |
| 387 | # Windows modules must be loaded first. |
| 388 | # Because OLE filters have low precedence over normal ones. |
| 389 | # |
| 390 | load_win32modules() if $English::OSNAME eq "MSWin32"; |
| 391 | |
| 392 | # Check filter modules |
| 393 | my @filters = (); |
| 394 | @filters = glob "$FILTERDIR/*.pl"; |
| 395 | |
| 396 | load_filters(@filters); |
| 397 | } |
| 398 | |
| 399 | sub load_win32modules () { |
| 400 | # Check filter modules |
| 401 | my @filters = (); |
| 402 | if (-f "../filter/win32/olemsword.pl") { # to ease developing |
| 403 | @filters = glob "../filter/win32/*.pl"; |
| 404 | unshift @INC, "../filter/win32"; |
| 405 | } else { |
| 406 | @filters = glob "$FILTERDIR/win32/*.pl"; |
| 407 | unshift @INC, "$FILTERDIR/win32"; |
| 408 | } |
| 409 | |
| 410 | load_filters(@filters); |
| 411 | } |
| 412 | |
| 413 | sub load_filters (@) { |
| 414 | my @filters = @_; |
| 415 | |
| 416 | for my $filter (@filters) { |
| 417 | $filter =~ m!([-\w]+)\.pl$!; |
| 418 | my $module = $1; |
| 419 | require "$module.pl" || die "unable to require \"$module.pl\"\n";; |
| 420 | my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv); |
| 421 | |
| 422 | eval "\@mtypes = ${module}::mediatype();"; |
| 423 | die $@ if $@; # eval error |
| 424 | eval "\$status = ${module}::status();"; |
| 425 | die $@ if $@; |
| 426 | eval "\$recursive = ${module}::recursive();"; |
| 427 | die $@ if $@; |
| 428 | eval "\$pre_codeconv = ${module}::pre_codeconv();"; |
| 429 | die $@ if $@; |
| 430 | eval "\$post_codeconv = ${module}::post_codeconv();"; |
| 431 | die $@ if $@; |
| 432 | eval "${module}::add_magic(\$Magic);"; |
| 433 | die $@ if $@; |
| 434 | |
| 435 | for my $mt (@mtypes) { |
| 436 | next if (defined $var::Supported{$mt} && |
| 437 | $var::Supported{$mt} eq 'yes' && $status eq 'no'); |
| 438 | $var::Supported{$mt} = $status; |
| 439 | $var::REQUIRE_ACTIONS{$mt} = $module; |
| 440 | $var::RECURSIVE_ACTIONS{$mt} = $recursive; |
| 441 | $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv; |
| 442 | $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv; |
| 443 | } |
| 444 | } |
| 445 | } |
| 446 | |
| 447 | # Core routine. |
| 448 | # |
| 449 | # FIXME: Too many parameters. They must be cleared. |
| 450 | # |
| 451 | sub namazu_core ($$$$$$$$$$) { |
| 452 | my ($cfile, $docid_count, $docid_base, |
| 453 | $file_count, $field_indices, $fh_errorsfile, $total_files_num, |
| 454 | $uri, $part_id, $part_num) = @_; |
| 455 | |
| 456 | my $headings = ""; |
| 457 | my $content = ""; |
| 458 | my $weighted_str = ""; |
| 459 | my %fields; |
| 460 | my $msg_prefix; |
| 461 | |
| 462 | if ($part_id) { |
| 463 | $msg_prefix = " $part_id/$part_num - "; |
| 464 | } else { |
| 465 | $msg_prefix = $file_count + 1 . "/$total_files_num - "; |
| 466 | } |
| 467 | |
| 468 | unless ($uri) { |
| 469 | $uri = generate_uri($cfile); # Make a URI from a file name. |
| 470 | } |
| 471 | my ($cfile_size, $text_size, $kanji, $mtype) = |
| 472 | load_document(\$cfile, \$content, \$weighted_str, |
| 473 | \$headings, \%fields); |
| 474 | |
| 475 | { |
| 476 | $fields{'mtime'} = (stat($cfile))[9]; |
| 477 | my $utc = $fields{'mtime'}; |
| 478 | $utc = time::rfc822time_to_mtime($fields{'date'}) |
| 479 | if (defined $fields{'date'}); |
| 480 | if ($utc == -1) { |
| 481 | my $date = $fields{'date'}; |
| 482 | print "$cfile Illegal date format. : $date\n"; |
| 483 | print $fh_errorsfile "$cfile Illegal date format. : $date\n"; |
| 484 | $utc = $fields{'mtime'}; |
| 485 | delete $fields{'date'}; |
| 486 | } |
| 487 | $fields{'utc'} = $utc; |
| 488 | } |
| 489 | |
| 490 | util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n"); |
| 491 | |
| 492 | # Check if the file is acceptable. |
| 493 | my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri); |
| 494 | if (defined $err) { |
| 495 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { |
| 496 | my $uri2 = codeconv::eucjp_to_shiftjis($uri); |
| 497 | print $msg_prefix . "$uri2 $err\n"; |
| 498 | } else { |
| 499 | print $msg_prefix . "$uri $err\n"; |
| 500 | } |
| 501 | print $fh_errorsfile "$cfile $err\n"; |
| 502 | return 0; # return 0 if error |
| 503 | } |
| 504 | |
| 505 | # Print processing file name as URI. |
| 506 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { |
| 507 | my $uri2 = codeconv::eucjp_to_shiftjis($uri); |
| 508 | print $msg_prefix . "$uri2 [$mtype]\n"; |
| 509 | } else { |
| 510 | print $msg_prefix . "$uri [$mtype]\n"; |
| 511 | } |
| 512 | |
| 513 | # Add filename. |
| 514 | my $filename = defined $cfile ? $cfile : ''; |
| 515 | codeconv::toeuc(\$filename); |
| 516 | $filename = basename($filename); |
| 517 | $fields{'filename'} = $filename; |
| 518 | |
| 519 | complete_field_info(\%fields, $cfile, $uri, |
| 520 | \$headings, \$content, \$weighted_str); |
| 521 | put_field_index(\%fields, $field_indices); |
| 522 | |
| 523 | put_dateindex($cfile); |
| 524 | $content .= "\n\n$filename\n\n"; # add filename |
| 525 | $content .= $weighted_str; # add weights |
| 526 | count_words($docid_count, $docid_base, \$content, $kanji); |
| 527 | make_phrase_hash($docid_count, $docid_base, \$content); |
| 528 | |
| 529 | # assertion |
| 530 | util::assert($cfile_size != 0, |
| 531 | "cfile_size == 0 at the end of namazu_core."); |
| 532 | |
| 533 | return $cfile_size; |
| 534 | } |
| 535 | |
| 536 | # |
| 537 | # Make the URI from the given file name. |
| 538 | # |
| 539 | sub generate_uri (@) { |
| 540 | my ($file, $fragment) = @_; |
| 541 | return "" unless defined $file; |
| 542 | |
| 543 | # omit a file name if omittable |
| 544 | $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o; |
| 545 | |
| 546 | if (defined $ReplaceCode) { |
| 547 | # transforming URI by evaling |
| 548 | $_ = $file; |
| 549 | eval $ReplaceCode; |
| 550 | $file = $_; |
| 551 | } |
| 552 | |
| 553 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { |
| 554 | $file =~ s#^([A-Z]):#/$1|#i; # converting a drive part like: /C| |
| 555 | } |
| 556 | |
| 557 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { |
| 558 | $file = codeconv::shiftjis_to_eucjp($file); |
| 559 | } |
| 560 | if (defined $fragment) { |
| 561 | codeconv::toeuc(\$fragment); |
| 562 | } |
| 563 | |
| 564 | unless ($var::Opt{'noencodeuri'}) { |
| 565 | for my $tmp ($file, $fragment) { |
| 566 | next unless defined $tmp; |
| 567 | |
| 568 | # Escape unsafe characters (not strict) |
| 569 | $tmp =~ s/\%/%25/g; # Convert original '%' into '%25' v1.1.1.2 |
| 570 | $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/ |
| 571 | sprintf("%%%02X",ord($1))/ge; |
| 572 | } |
| 573 | } |
| 574 | |
| 575 | |
| 576 | my $uri = $file; |
| 577 | $uri .= "#" . $fragment if defined $fragment; |
| 578 | if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) { |
| 579 | # restore '|' for drive letter rule of Win32, OS/2 |
| 580 | $uri =~ s!^/([A-Z])%7C!/$1|!i; |
| 581 | } |
| 582 | return $uri; |
| 583 | } |
| 584 | |
| 585 | |
| 586 | sub get_field_index_base (\%) { |
| 587 | my ($field_indices) = @_; |
| 588 | |
| 589 | my @keys = split('\|', $conf::SEARCH_FIELD); |
| 590 | if ($var::Opt{'meta'}) { |
| 591 | push @keys, (split '\|', $conf::META_TAGS); |
| 592 | } |
| 593 | for my $key (@keys) { |
| 594 | $key = lc($key); |
| 595 | my $fname = "$var::NMZ{'field'}.$key"; |
| 596 | my $tmp_fname = util::tmpnam("NMZ.field.$key"); |
| 597 | my $size = 0; |
| 598 | $size = -s $fname if -f $fname; |
| 599 | $size += -s $tmp_fname if -f $tmp_fname; |
| 600 | $field_indices->{$key} = $size; |
| 601 | } |
| 602 | } |
| 603 | |
| 604 | sub complete_field_info (\%$$\$\$\$) { |
| 605 | my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_; |
| 606 | |
| 607 | for my $field (keys %{$fields}) { |
| 608 | if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) { |
| 609 | delete $fields->{$field}; |
| 610 | } |
| 611 | } |
| 612 | |
| 613 | unless (defined($fields->{'title'})) { |
| 614 | $fields->{'title'} = gfilter::filename_to_title($cfile, $wsref); |
| 615 | } |
| 616 | unless (defined($fields->{'date'})) { |
| 617 | my $mtime = $fields->{'mtime'}; |
| 618 | my $date = util::rfc822time($mtime); |
| 619 | $fields->{'date'} = $date; |
| 620 | } |
| 621 | unless (defined($fields->{'uri'})) { |
| 622 | $fields->{'uri'} = $uri; |
| 623 | } |
| 624 | unless (defined($fields->{'size'})) { |
| 625 | $fields->{'size'} = -s $cfile; |
| 626 | } |
| 627 | unless (defined($fields->{'summary'})) { |
| 628 | $fields->{'summary'} = make_summary($contref, $headings, $cfile); |
| 629 | } |
| 630 | unless (defined($fields->{'from'}) || defined($fields->{'author'})) { |
| 631 | $fields->{'from'} = getmsg("unknown"); |
| 632 | } |
| 633 | } |
| 634 | |
| 635 | # |
| 636 | # Currently, messages for NMZ.* files should be encoded in |
| 637 | # EUC-JP currently. We cannot use gettext.pl for the messsage |
| 638 | # because gettext.pl may use Shift_JIS encoded messages. |
| 639 | # So, we should use the function instead of gettext(). |
| 640 | # |
| 641 | # FIXME: Ad hoc impl. getmsg() is effective only for "unknown". |
| 642 | # |
| 643 | sub getmsg($) { |
| 644 | my ($msg) = @_; |
| 645 | |
| 646 | if (util::islang_msg("ja")) { |
| 647 | if ($msg eq "unknown") { |
| 648 |