document branches better
[mharc.git] / cgi-bin / mknmzrc
CommitLineData
01c223d0
BOFG
1# $Id: mknmzrc.in.dist,v 1.3 2002/03/06 22:47:29 ehood Exp $
2# This is a Namazu configuration file for mknmz.
3#
4package conf; # Don't remove this line!
5
6#===================================================================
7#
8# Administrator's email address
9#
10$ADDRESS = 'sysadmin@gnu.org';
11
12
13#===================================================================
14#
15# Regular Expression Patterns
16#
17
18#
19# This pattern specifies HTML suffixes.
20#
21$HTML_SUFFIX = "html|[ps]html";
22
23#
24# This pattern specifies file names which will be targeted.
25# NOTE: It can be specified by --allow=regex option.
26# Do NOT use `$' or `^' anchors.
27# Case-insensitive.
28#
29$ALLOW_FILE = ".*\\.(?:$HTML_SUFFIX)|.*\\.txt" . # HTML, plain text
30 "|.*\\.gz|.*\\.Z|.*\\.bz2" . # Compressed files
31 "|\\d+|[-\\w]+\\.[1-9n]"; # Mail/News, man
32
33#
34# This pattern specifies file names which will NOT be targeted.
35# NOTE: It can be specified by --deny=regex option.
36# Do NOT use `$' or `^' anchors.
37# Case-insensitive.
38#
39$DENY_FILE = "(index|threads)\\.html|.*\\.(gif|png|jpg|jpeg)|.*\\.tar\\.gz|core|.*\\.bak|.*~|\\..*|\x23.*";
40
41#
42# This pattern specifies PATHNAMEs which will NOT be targeted.
43# NOTE: Usually specified by --exclude=regex option.
44#
45# $EXCLUDE_PATH = undef;
46
47#
48# This pattern specifies file names which can be omitted
49# in URI. e.g., 'index.html|index.htm|Default.html'
50#
51# NOTE: This is similar to Apache's "DirectoryIndex" directive.
52#
53# $DIRECTORY_INDEX = "";
54
55#
56# This pattern specifies the Mail/News header fields which
57# should be searchable. NOTE: case-insensitive
58#
59# $REMAIN_HEADER = "From|Date|Message-ID";
60
61#
62# This pattern specifies fields which are used for field-specified
63# searching. NOTE: case-insensitive
64#
65# $SEARCH_FIELD = "message-id|subject|from|date|uri|newsgroups|to|summary|size";
66
67#
68# This pattern specifies meta tags which are used for field-specified
69# searching. NOTE: case-insensitive
70#
71# $META_TAGS = "keywords|description";
72
73#
74# This hash specifies aliases for NMZ.field.* files.
75# NOTE: Editing NOT recommended.
76#
77# %FIELD_ALIASES = ('title' => 'subject', 'author' => 'from');
78
79#
80# This pattern specifies HTML elements which should be replaced with
81# a null string when removing them. Normally, the elements are replaced
82# with a single space character.
83#
84# $NON_SEPARATION_ELEMENTS = 'A|TT|CODE|SAMP|KBD|VAR|B|STRONG|I|EM|CITE|FONT|U|'.
85# 'STRIKE|BIG|SMALL|DFN|ABBR|ACRONYM|Q|SUB|SUP|SPAN|BDO';
86
87#===================================================================
88#
89# Critical Numbers
90#
91
92#
93# The max size of files which can be loaded in memory at once.
94# If you have much memory, you can increase the value.
95# If you have less memory, you can decrease the value.
96#
97$ON_MEMORY_MAX = 5000000;
98
99#
100# The max file size for indexing. Files larger than this
101# will be ignored.
102# NOTE: This value is usually larger than TEXT_SIZE_MAX because
103# binary-formatted files such as PDF and Word documents are larger.
104#
105$FILE_SIZE_MAX = 500000;
106
107#
108# The max text size for indexing. Files larger than this
109# will be ignored.
110#
111$TEXT_SIZE_MAX = 100000;
112
113#
114# The max length of a word. Words longer than this will be ignored.
115#
116$WORD_LENG_MAX = 40;
117
118
119#
120# Weights for HTML elements which are used for term weighting.
121#
122# %Weight =
123# (
124# 'html' => {
125# 'title' => 16,
126# 'h1' => 8,
127# 'h2' => 7,
128# 'h3' => 6,
129# 'h4' => 5,
130# 'h5' => 4,
131# 'h6' => 3,
132# 'a' => 4,
133# 'strong' => 2,
134# 'em' => 2,
135# 'kbd' => 2,
136# 'samp' => 2,
137# 'var' => 2,
138# 'code' => 2,
139# 'cite' => 2,
140# 'abbr' => 2,
141# 'acronym'=> 2,
142# 'dfn' => 2,
143# },
144# 'metakey' => 32, # for <meta name="keywords" content="foo bar">
145# 'headers' => 8, # for Mail/News' headers
146# );
147
148#
149# The max length of a HTML-tagged string which can be processed for
150# term weighting.
151# NOTE: Quite a few people misuse <h[1-6]> elements
152# merely for changing the font size.
153#
154# $INVALID_LENG = 128;
155
156#
157# The max length of a field.
158# This MUST be smaller than libnamazu.h's BUFSIZE (usually 1024).
159#
160# $MAX_FIELD_LENGTH = 200;
161
162
163#===================================================================
164#
165# Software for handling Japanese text
166#
167
168#
169# Network Kanji Filter nkf v1.62 or later
170#
171# $NKF = "no";
172
173#
174# KAKASI
175#
176# $KAKASI = "no -ieuc -oeuc -w";
177
178#
179# ChaSen 1.51 or later (simple wakatigaki)
180#
181# $CHASEN = "no -j -F '\%m '";
182
183#
184# ChaSen 1.51 or later (with noun words extraction)
185#
186# $CHASEN_NOUN = "no -j -F '\%m %H\\n'";
187
188#
189# Default Japanese processor: KAKASI or ChaSen.
190#
191# $WAKATI = $none;
192
193
194#===================================================================
195#
196# Directories
197#
198# $LIBDIR = "@PERLLIBDIR@";
199# $FILTERDIR = "@FILTERDIR@";
200# $TEMPLATEDIR = "@TEMPLATEDIR@";
201
202# 1;
203