utf 8 on index pages, rt 1443213
[mharc.git] / cgi-bin / mknmzrc.in.dist
1 # $Id: mknmzrc.in.dist,v 1.3 2002/03/06 22:47:29 ehood Exp $
2 # This is a Namazu configuration file for mknmz.
3 #
4 package conf; # Don't remove this line! Unqualified variables below are globals of package conf.
5
6 #===================================================================
7 #
8 # Administrator's email address.
9 # NOTE(review): '@@ADMIN_ADDRESS@@' looks like a template placeholder that is substituted before use -- confirm against the install scripts.
10 $ADDRESS = '@@ADMIN_ADDRESS@@';
11
12
13 #===================================================================
14 #
15 # Regular Expression Patterns
16 #
17
18 #
19 # This pattern specifies HTML suffixes: html, phtml and shtml.
20 # It is interpolated into $ALLOW_FILE below, so keep it a plain alternation.
21 $HTML_SUFFIX = "html|[ps]html";
22
23 #
24 # This pattern specifies file names which will be targeted.
25 # NOTE: It can be specified by --allow=regex option.
26 # Do NOT use `$' or `^' anchors.
27 # Case-insensitive.
28 # The pieces are double-quoted strings, so each `\\' becomes one backslash in the final regex.
29 $ALLOW_FILE = ".*\\.(?:$HTML_SUFFIX)|.*\\.txt" . # HTML, plain text
30 "|.*\\.gz|.*\\.Z|.*\\.bz2" . # Compressed files
31 "|\\d+|[-\\w]+\\.[1-9n]"; # Mail/News (all-digit names), man pages
32
33 #
34 # This pattern specifies file names which will NOT be targeted.
35 # NOTE: It can be specified by --deny=regex option.
36 # Do NOT use `$' or `^' anchors.
37 # Case-insensitive.
38 # Alternatives: index.html/threads.html, gif/png/jpg/jpeg files, .tar.gz, core, .bak, names ending in ~, dot files, and `#...' names (\x23 is the `#' character).
39 $DENY_FILE = "(index|threads)\\.html|.*\\.(gif|png|jpg|jpeg)|.*\\.tar\\.gz|core|.*\\.bak|.*~|\\..*|\x23.*";
40
41 #
42 # This pattern specifies PATHNAMEs which will NOT be targeted.
43 # NOTE: Usually specified by --exclude=regex option.
44 #
45 # $EXCLUDE_PATH = undef;
46
47 #
48 # This pattern specifies file names which can be omitted
49 # in URI. e.g., 'index.html|index.htm|Default.html'
50 #
51 # NOTE: This is similar to Apache's "DirectoryIndex" directive.
52 #
53 # $DIRECTORY_INDEX = "";
54
55 #
56 # This pattern specifies the Mail/News header fields which
57 # should be searchable. NOTE: case-insensitive
58 #
59 # $REMAIN_HEADER = "From|Date|Message-ID";
60
61 #
62 # This pattern specifies fields which are used for field-specified
63 # searching. NOTE: case-insensitive
64 #
65 # $SEARCH_FIELD = "message-id|subject|from|date|uri|newsgroups|to|summary|size";
66
67 #
68 # This pattern specifies meta tags which are used for field-specified
69 # searching. NOTE: case-insensitive
70 #
71 # $META_TAGS = "keywords|description";
72
73 #
74 # This hash specifies aliases for NMZ.field.* files.
75 # NOTE: Editing NOT recommended.
76 #
77 # %FIELD_ALIASES = ('title' => 'subject', 'author' => 'from');
78
79 #
80 # This pattern specifies HTML elements which should be replaced with
81 # an empty string when removing them. Normally, the elements are replaced
82 # with a single space character.
83 #
84 # $NON_SEPARATION_ELEMENTS = 'A|TT|CODE|SAMP|KBD|VAR|B|STRONG|I|EM|CITE|FONT|U|'.
85 # 'STRIKE|BIG|SMALL|DFN|ABBR|ACRONYM|Q|SUB|SUP|SPAN|BDO';
86
87 #===================================================================
88 #
89 # Critical Numbers
90 #
91
92 #
93 # The max size of files which can be loaded in memory at once.
94 # If you have plenty of memory, you can increase the value.
95 # If memory is limited, you can decrease the value.
96 # NOTE(review): presumably a byte count -- confirm against the Namazu documentation.
97 $ON_MEMORY_MAX = 5000000;
98
99 #
100 # The max file size for indexing. Files larger than this
101 # will be ignored.
102 # NOTE: This value is usually larger than TEXT_SIZE_MAX because
103 # binary-formatted files such as PDF and Word documents are larger.
104 #
105 $FILE_SIZE_MAX = 500000;
106
107 #
108 # The max text size for indexing. Files larger than this
109 # will be ignored.
110 # In this file the value is kept smaller than $FILE_SIZE_MAX.
111 $TEXT_SIZE_MAX = 100000;
112
113 #
114 # The max length of a word. Words longer than this will be ignored.
115 #
116 $WORD_LENG_MAX = 40;
117
118
119 #
120 # Weights for HTML elements which are used for term weighting.
121 #
122 # %Weight =
123 # (
124 # 'html' => {
125 # 'title' => 16,
126 # 'h1' => 8,
127 # 'h2' => 7,
128 # 'h3' => 6,
129 # 'h4' => 5,
130 # 'h5' => 4,
131 # 'h6' => 3,
132 # 'a' => 4,
133 # 'strong' => 2,
134 # 'em' => 2,
135 # 'kbd' => 2,
136 # 'samp' => 2,
137 # 'var' => 2,
138 # 'code' => 2,
139 # 'cite' => 2,
140 # 'abbr' => 2,
141 # 'acronym'=> 2,
142 # 'dfn' => 2,
143 # },
144 # 'metakey' => 32, # for <meta name="keywords" content="foo bar">
145 # 'headers' => 8, # for Mail/News' headers
146 # );
147
148 #
149 # The max length of a HTML-tagged string which can be processed for
150 # term weighting.
151 # NOTE: Quite a few people misuse
152 # <h[1-6]> merely to change the font size.
153 #
154 # $INVALID_LENG = 128;
155
156 #
157 # The max length of a field.
158 # This MUST be smaller than libnamazu.h's BUFSIZE (usually 1024).
159 #
160 # $MAX_FIELD_LENGTH = 200;
161
162
163 #===================================================================
164 #
165 # Software for handling Japanese text
166 #
167
168 #
169 # Network Kanji Filter nkf v1.62 or later
170 #
171 # $NKF = "no";
172
173 #
174 # KAKASI
175 #
176 # $KAKASI = "no -ieuc -oeuc -w";
177
178 #
179 # ChaSen 1.51 or later (simple wakatigaki)
180 #
181 # $CHASEN = "no -j -F '\%m '";
182
183 #
184 # ChaSen 1.51 or later (with noun words extraction)
185 #
186 # $CHASEN_NOUN = "no -j -F '\%m %H\\n'";
187
188 #
189 # Default Japanese processor: KAKASI or ChaSen.
190 #
191 # $WAKATI = $none;
192
193
194 #===================================================================
195 #
196 # Directories
197 #
198 # $LIBDIR = "@PERLLIBDIR@";
199 # $FILTERDIR = "@FILTERDIR@";
200 # $TEMPLATEDIR = "@TEMPLATEDIR@";
201
202 # 1;
203