1
24
25
27 package weblech.spider;
28
29 import weblech.util.Logger;
30
31 import java.io.File;
32 import java.io.Serializable;
33 import java.util.*;
34 import java.net.URL;
35 import java.net.MalformedURLException;
36
37 public class SpiderConfig extends Logger implements Serializable
38 {
39 private File saveRootDirectory;
40 private File mailtoLogFile;
41
42 private boolean refreshHTMLs;
43 private boolean refreshImages;
44 private boolean refreshOthers;
45
46 private Set htmlExtensions;
47 private Set imageExtensions;
48
49 private URL startLocation;
50 private String urlMatch;
51
52 private List interestingURLSubstrings;
53 private List boringURLSubstrings;
54
55 private boolean depthFirst;
56 private int maxDepth;
57
58 private String userAgent;
59
60 private String basicAuthUser;
61 private String basicAuthPassword;
62
63 private int spiderThreads;
64
65 private long checkpointInterval;
66
67
70 public SpiderConfig()
71 {
72 _logClass.debug("SpiderConfig()");
73
74 saveRootDirectory = new File(".");
75 mailtoLogFile = new File("mailto.txt");
76
77 refreshHTMLs = true;
78 refreshImages = false;
79 refreshOthers = false;
80
81 htmlExtensions = new HashSet();
82 htmlExtensions.add("htm");
83 htmlExtensions.add("html");
84 htmlExtensions.add("shtml");
85
86 imageExtensions = new HashSet();
87 imageExtensions.add("jpg");
88 imageExtensions.add("gif");
89 imageExtensions.add("png");
90
91 urlMatch = null;
92 interestingURLSubstrings = new ArrayList();
93 boringURLSubstrings = new ArrayList();
94 depthFirst = false;
95 maxDepth = 0;
96
97 userAgent = "WebLech Spider 0.01alpha";
98 basicAuthUser = "";
99 basicAuthPassword = "";
100
101 spiderThreads = 1;
102
103 checkpointInterval = 0;
104 }
105
106
109 public SpiderConfig(Properties props)
110 {
111 _logClass.debug("SpiderConfig(props)");
112
113 saveRootDirectory = new File(props.getProperty("saveRootDirectory", "."));
114 if(!saveRootDirectory.exists())
115 {
116 if(!saveRootDirectory.mkdirs())
117 {
118 _logClass.error("Couldn't create root directory: " + saveRootDirectory);
119 _logClass.info("Defaulting to . instead");
120 saveRootDirectory = new File(".");
121 }
122 }
123 else if(!saveRootDirectory.isDirectory())
124 {
125 _logClass.error("Save root is not a directory: " + saveRootDirectory);
126 _logClass.info("Defaulting to . instead");
127 saveRootDirectory = new File(".");
128 }
129
130 String mailtoFileStr = props.getProperty("mailtoLogFile", "mailto.txt");
131 if(mailtoFileStr.indexOf(":") != -1 || mailtoFileStr.startsWith("/") || mailtoFileStr.startsWith("\\"))
133 {
134 _logClass.debug("Using absolute file name " + mailtoFileStr);
135 mailtoLogFile = new File(mailtoFileStr);
136 }
137 else
138 {
139 _logClass.debug("Constructing relative file name " + saveRootDirectory.getPath() + "/" + mailtoFileStr);
140 mailtoLogFile = new File(saveRootDirectory.getPath() + "/" + mailtoFileStr);
141 }
142
143 refreshHTMLs = Boolean.valueOf(props.getProperty("refreshHTMLs", "true")).booleanValue();
144 refreshImages = Boolean.valueOf(props.getProperty("refreshImages", "false")).booleanValue();
145 refreshOthers = Boolean.valueOf(props.getProperty("refreshOthers", "false")).booleanValue();
146
147 htmlExtensions = parseSet(props.getProperty("htmlExtensions", "htm,html,shtml"));
148 imageExtensions = parseSet(props.getProperty("imageExtensions", "jpg,gif,png"));
149
150 String startLocStr = props.getProperty("startLocation");
151 if(startLocStr != null)
152 {
153 try
154 {
155 startLocation = new URL(startLocStr);
156 }
157 catch(MalformedURLException murle)
158 {
159 _logClass.error("Caught MalformedURLException parsing start URL '" + startLocStr + "' : " + murle.getMessage(), murle);
160 }
161 }
162 else
163 {
164 _logClass.warn("startLocation not found in properties");
165 }
166
167 urlMatch = props.getProperty("urlMatch");
168
169 interestingURLSubstrings = parsePropCommaSeparated(props.getProperty("interestingURLs"));
170 boringURLSubstrings = parsePropCommaSeparated(props.getProperty("boringURLs"));
171
172 depthFirst = Boolean.valueOf(props.getProperty("depthFirst", "false")).booleanValue();
173 try
174 {
175 String maxDepthStr = props.getProperty("maxDepth", "0");
176 maxDepth = Integer.parseInt(maxDepthStr);
177 }
178 catch(NumberFormatException nfe)
179 {
180 _logClass.error("Caught number format exception parsing max depth, defaulting to 1", nfe);
181 maxDepth = 1;
182 }
183
184 userAgent = props.getProperty("userAgent", "WebLech Spider 0.01alpha");
185 basicAuthUser = props.getProperty("basicAuthUser", "");
186 basicAuthPassword = props.getProperty("basicAuthPassword", "");
187
188 try
189 {
190 String threadsStr = props.getProperty("spiderThreads", "1");
191 spiderThreads = Integer.parseInt(threadsStr);
192 }
193 catch(NumberFormatException nfe)
194 {
195 _logClass.error("Caught number format exception parsing number of threads, defaulting to 1", nfe);
196 spiderThreads = 1;
197 }
198
199 try
200 {
201 String intervalStr = props.getProperty("checkpointInterval", "0");
202 checkpointInterval = Long.parseLong(intervalStr);
203 }
204 catch(NumberFormatException nfe)
205 {
206 _logClass.error("Caught number format exception parsing checkpoint interval, defaulting to 0", nfe);
207 spiderThreads = 1;
208 }
209 }
210
211 private List parsePropCommaSeparated(String str)
212 {
213 ArrayList result = new ArrayList();
214 if(str != null && str.length() > 0)
215 {
216 StringTokenizer tok = new StringTokenizer(str, ",");
217 while(tok.hasMoreTokens())
218 {
219 result.add(tok.nextToken());
220 }
221 }
222 return result;
223 }
224
225
226 public void setRefreshHTMLs(boolean refreshHTMLs)
227 {
228 this.refreshHTMLs = refreshHTMLs;
229 }
230
231 public boolean refreshHTMLs()
232 {
233 return refreshHTMLs;
234 }
235
236 public void setRefreshImages(boolean refreshImages)
237 {
238 this.refreshImages = refreshImages;
239 }
240
241 public boolean refreshImages()
242 {
243 return refreshImages;
244 }
245
246 public void setRefreshOthers(boolean refreshOthers)
247 {
248 this.refreshOthers = refreshOthers;
249 }
250
251 public boolean refreshOthers()
252 {
253 return refreshOthers;
254 }
255
256 public void setSaveRootDirectory(File saveRootDirectory)
257 {
258 this.saveRootDirectory = saveRootDirectory;
259 }
260
261 public File getSaveRootDirectory()
262 {
263 return saveRootDirectory;
264 }
265
266 public void setMailtoLogFile(File mailtoLogFile)
267 {
268 this.mailtoLogFile = mailtoLogFile;
269 }
270
271 public File getMailtoLogFile()
272 {
273 return mailtoLogFile;
274 }
275
276 public void setStartLocation(URL startLocation)
277 {
278 this.startLocation = startLocation;
279 }
280
281 public URL getStartLocation()
282 {
283 return startLocation;
284 }
285
286 public void setURLMatch(String urlMatch)
287 {
288 this.urlMatch = urlMatch;
289 }
290
291 public String getURLMatch()
292 {
293 return urlMatch;
294 }
295
296 public List getInterestingURLSubstrings()
297 {
298 return interestingURLSubstrings;
299 }
300
301 public void setInterestingURLSubstrings(List interestingURLSubstrings)
302 {
303 this.interestingURLSubstrings = interestingURLSubstrings;
304 }
305
306 public List getBoringURLSubstrings()
307 {
308 return boringURLSubstrings;
309 }
310
311 public void setBoringURLSubstrings(List boringURLSubstrings)
312 {
313 this.boringURLSubstrings = boringURLSubstrings;
314 }
315
316 public boolean isInteresting(URL u)
317 {
318 return matchURL(u, interestingURLSubstrings);
319 }
320
321 public boolean isBoring(URL u)
322 {
323 return matchURL(u, boringURLSubstrings);
324 }
325
326 private boolean matchURL(URL u, List substrings)
327 {
328 String str = u.toExternalForm();
329 for(Iterator i = substrings.iterator(); i.hasNext(); )
330 {
331 String substr = (String) i.next();
332 if(str.indexOf(substr) != -1)
333 {
334 return true;
335 }
336 }
337 return false;
338 }
339
340 public void setDepthFirstSearch(boolean depthFirst)
341 {
342 this.depthFirst = depthFirst;
343 }
344
345 public boolean isDepthFirstSearch()
346 {
347 return depthFirst;
348 }
349
350 public void setMaxDepth(int maxDepth)
351 {
352 this.maxDepth = maxDepth;
353 }
354
355 public int getMaxDepth()
356 {
357 return maxDepth;
358 }
359
360 public void setUserAgent(String userAgent)
361 {
362 this.userAgent = userAgent;
363 }
364
365 public String getUserAgent()
366 {
367 return userAgent;
368 }
369
370 public void setBasicAuthUser(String basicAuthUser)
371 {
372 this.basicAuthUser = basicAuthUser;
373 }
374
375 public String getBasicAuthUser()
376 {
377 return basicAuthUser;
378 }
379
380 public void setBasicAuthPassword(String basicAuthPassword)
381 {
382 this.basicAuthPassword = basicAuthPassword;
383 }
384
385 public String getBasicAuthPassword()
386 {
387 return basicAuthPassword;
388 }
389
390 public void setSpiderThreads(int spiderThreads)
391 {
392 this.spiderThreads = spiderThreads;
393 }
394
395 public int getSpiderThreads()
396 {
397 return spiderThreads;
398 }
399
400 public void setCheckpointInterval(long interval)
401 {
402 this.checkpointInterval = interval;
403 }
404
405 public long getCheckpointInterval()
406 {
407 return checkpointInterval;
408 }
409
410 public String toString()
411 {
412 return "depthFirst:\t" + depthFirst
413 + "\nmaxDepth:\t" + maxDepth
414 + "\nhtmlExtensions:\t" + fromSet(htmlExtensions)
415 + "\nimageExtensions:\t" + fromSet(imageExtensions)
416 + "\nrefreshHTMLs:\t" + refreshHTMLs
417 + "\nrefreshImages:\t" + refreshImages
418 + "\nrefreshOthers:\t" + refreshOthers
419 + "\nsaveRootDirectory:\t" + saveRootDirectory
420 + "\nstartLocation:\t" + startLocation
421 + "\nurlMatch:\t" + urlMatch
422 + "\nuserAgent:\t" + userAgent
423 + "\nbasicAuthUser:\t" + basicAuthUser
424 + "\nbasicAuthPassword:\t" + "***"
425 + "\nspiderThreads:\t" + spiderThreads
426 + "\ncheckpointInterval:\t" + checkpointInterval;
427 }
428
429 private Set parseSet(String str)
430 {
431 _logClass.debug("parseSet(" + str + ")");
432 HashSet result = new HashSet();
433 StringTokenizer sTok = new StringTokenizer(str, ",");
434 while(sTok.hasMoreTokens())
435 {
436 String tok = sTok.nextToken().trim();
437 result.add(tok);
438 }
439 return result;
440 }
441
442 private String fromSet(Set s)
443 {
444 StringBuffer sb = new StringBuffer();
445 boolean first = true;
446 for(Iterator i = s.iterator(); i.hasNext(); )
447 {
448 String str = (String) i.next();
449 if(first)
450 {
451 first = false;
452 }
453 else
454 {
455 sb.append(",");
456 }
457 sb.append(str);
458 }
459 return sb.toString();
460 }
461
462 }