1   /*
2    * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
3    *
4    * Copyright (c) 2001 Brian Pitcher
5    *
6    * Permission is hereby granted, free of charge, to any person obtaining a
7    * copy of this software and associated documentation files (the "Software"),
8    * to deal in the Software without restriction, including without limitation
9    * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10   * and/or sell copies of the Software, and to permit persons to whom the
11   * Software is furnished to do so, subject to the following conditions:
12   *
13   * The above copyright notice and this permission notice shall be included in
14   * all copies or substantial portions of the Software.
15   *
16   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22   * SOFTWARE.
23   */
24  
25  // $Header: /cvsroot/weblech/weblech/src/weblech/spider/SpiderConfig.java,v 1.9 2002/06/09 11:36:23 weblech Exp $
26  
27  package weblech.spider;
28  
29  import weblech.util.Logger;
30  
31  import java.io.File;
32  import java.io.Serializable;
33  import java.util.*;
34  import java.net.URL;
35  import java.net.MalformedURLException;
36  
37  public class SpiderConfig extends Logger implements Serializable
38  {
39      private File saveRootDirectory;
40      private File mailtoLogFile;
41  
42      private boolean refreshHTMLs;
43      private boolean refreshImages;
44      private boolean refreshOthers;
45  
46      private Set htmlExtensions;
47      private Set imageExtensions;
48  
49      private URL startLocation;
50      private String urlMatch;
51  
52      private List interestingURLSubstrings;
53      private List boringURLSubstrings;
54  
55      private boolean depthFirst;
56      private int maxDepth;
57  
58      private String userAgent;
59  
60      private String basicAuthUser;
61      private String basicAuthPassword;
62  
63      private int spiderThreads;
64  
65      private long checkpointInterval;
66  
67      /**
68       * Create a default config.
69       */
70      public SpiderConfig()
71      {
72          _logClass.debug("SpiderConfig()");
73  
74          saveRootDirectory = new File(".");
75          mailtoLogFile = new File("mailto.txt");
76  
77          refreshHTMLs = true;
78          refreshImages = false;
79          refreshOthers = false;
80  
81          htmlExtensions = new HashSet();
82          htmlExtensions.add("htm");
83          htmlExtensions.add("html");
84          htmlExtensions.add("shtml");
85  
86          imageExtensions = new HashSet();
87          imageExtensions.add("jpg");
88          imageExtensions.add("gif");
89          imageExtensions.add("png");
90  
91          urlMatch = null;
92          interestingURLSubstrings = new ArrayList();
93          boringURLSubstrings = new ArrayList();
94          depthFirst = false;
95          maxDepth = 0;
96  
97          userAgent = "WebLech Spider 0.01alpha";
98          basicAuthUser = "";
99          basicAuthPassword = "";
100 
101         spiderThreads = 1;
102 
103         checkpointInterval = 0;
104     }
105 
106     /**
107      * Create a config from a java.util.Properties object.
108      */
109     public SpiderConfig(Properties props)
110     {
111         _logClass.debug("SpiderConfig(props)");
112 
113         saveRootDirectory = new File(props.getProperty("saveRootDirectory", "."));
114         if(!saveRootDirectory.exists())
115         {
116             if(!saveRootDirectory.mkdirs())
117             {
118                 _logClass.error("Couldn't create root directory: " + saveRootDirectory);
119                 _logClass.info("Defaulting to . instead");
120                 saveRootDirectory = new File(".");
121             }
122         }
123         else if(!saveRootDirectory.isDirectory())
124         {
125             _logClass.error("Save root is not a directory: " + saveRootDirectory);
126             _logClass.info("Defaulting to . instead");
127             saveRootDirectory = new File(".");
128         }
129 
130         String mailtoFileStr = props.getProperty("mailtoLogFile", "mailto.txt");
131         // Check if absolute or relative name given
132         if(mailtoFileStr.indexOf(":") != -1 || mailtoFileStr.startsWith("/") || mailtoFileStr.startsWith("\\"))
133         {
134             _logClass.debug("Using absolute file name " + mailtoFileStr);
135             mailtoLogFile = new File(mailtoFileStr);
136         }
137         else
138         {
139             _logClass.debug("Constructing relative file name " + saveRootDirectory.getPath() + "/" + mailtoFileStr);
140             mailtoLogFile = new File(saveRootDirectory.getPath() + "/" + mailtoFileStr);
141         }
142 
143         refreshHTMLs = Boolean.valueOf(props.getProperty("refreshHTMLs", "true")).booleanValue();
144         refreshImages = Boolean.valueOf(props.getProperty("refreshImages", "false")).booleanValue();
145         refreshOthers = Boolean.valueOf(props.getProperty("refreshOthers", "false")).booleanValue();
146 
147         htmlExtensions = parseSet(props.getProperty("htmlExtensions", "htm,html,shtml"));
148         imageExtensions = parseSet(props.getProperty("imageExtensions", "jpg,gif,png"));
149 
150         String startLocStr = props.getProperty("startLocation");
151         if(startLocStr != null)
152         {
153             try
154             {
155                 startLocation = new URL(startLocStr);
156             }
157             catch(MalformedURLException murle)
158             {
159                 _logClass.error("Caught MalformedURLException parsing start URL '" + startLocStr + "' : " + murle.getMessage(), murle);
160             }
161         }
162         else
163         {
164             _logClass.warn("startLocation not found in properties");
165         }
166 
167         urlMatch = props.getProperty("urlMatch");
168 
169         interestingURLSubstrings = parsePropCommaSeparated(props.getProperty("interestingURLs"));
170         boringURLSubstrings = parsePropCommaSeparated(props.getProperty("boringURLs"));
171 
172         depthFirst = Boolean.valueOf(props.getProperty("depthFirst", "false")).booleanValue();
173         try
174         {
175             String maxDepthStr = props.getProperty("maxDepth", "0");
176             maxDepth = Integer.parseInt(maxDepthStr);
177         }
178         catch(NumberFormatException nfe)
179         {
180             _logClass.error("Caught number format exception parsing max depth, defaulting to 1", nfe);
181             maxDepth = 1;
182         }
183 
184         userAgent = props.getProperty("userAgent", "WebLech Spider 0.01alpha");
185         basicAuthUser = props.getProperty("basicAuthUser", "");
186         basicAuthPassword = props.getProperty("basicAuthPassword", "");
187 
188         try
189         {
190             String threadsStr = props.getProperty("spiderThreads", "1");
191             spiderThreads = Integer.parseInt(threadsStr);
192         }
193         catch(NumberFormatException nfe)
194         {
195             _logClass.error("Caught number format exception parsing number of threads, defaulting to 1", nfe);
196             spiderThreads = 1;
197         }
198 
199         try
200         {
201             String intervalStr = props.getProperty("checkpointInterval", "0");
202             checkpointInterval = Long.parseLong(intervalStr);
203         }
204         catch(NumberFormatException nfe)
205         {
206             _logClass.error("Caught number format exception parsing checkpoint interval, defaulting to 0", nfe);
207             spiderThreads = 1;
208         }
209     }
210 
211     private List parsePropCommaSeparated(String str)
212     {
213         ArrayList result = new ArrayList();
214         if(str != null && str.length() > 0)
215         {
216             StringTokenizer tok = new StringTokenizer(str, ",");
217             while(tok.hasMoreTokens())
218             {
219                 result.add(tok.nextToken());
220             }
221         }
222         return result;
223     }
224 
225 
226     public void setRefreshHTMLs(boolean refreshHTMLs)
227     {
228         this.refreshHTMLs = refreshHTMLs;
229     }
230 
231     public boolean refreshHTMLs()
232     {
233         return refreshHTMLs;
234     }
235 
236     public void setRefreshImages(boolean refreshImages)
237     {
238         this.refreshImages = refreshImages;
239     }
240 
241     public boolean refreshImages()
242     {
243         return refreshImages;
244     }
245 
246     public void setRefreshOthers(boolean refreshOthers)
247     {
248         this.refreshOthers = refreshOthers;
249     }
250 
251     public boolean refreshOthers()
252     {
253         return refreshOthers;
254     }
255 
256     public void setSaveRootDirectory(File saveRootDirectory)
257     {
258         this.saveRootDirectory = saveRootDirectory;
259     }
260 
261     public File getSaveRootDirectory()
262     {
263         return saveRootDirectory;
264     }
265 
266     public void setMailtoLogFile(File mailtoLogFile)
267     {
268         this.mailtoLogFile = mailtoLogFile;
269     }
270 
271     public File getMailtoLogFile()
272     {
273         return mailtoLogFile;
274     }
275 
276     public void setStartLocation(URL startLocation)
277     {
278         this.startLocation = startLocation;
279     }
280 
281     public URL getStartLocation()
282     {
283         return startLocation;
284     }
285 
286     public void setURLMatch(String urlMatch)
287     {
288         this.urlMatch = urlMatch;
289     }
290 
291     public String getURLMatch()
292     {
293         return urlMatch;
294     }
295 
296     public List getInterestingURLSubstrings()
297     {
298         return interestingURLSubstrings;
299     }
300 
301     public void setInterestingURLSubstrings(List interestingURLSubstrings)
302     {
303         this.interestingURLSubstrings = interestingURLSubstrings;
304     }
305 
306     public List getBoringURLSubstrings()
307     {
308         return boringURLSubstrings;
309     }
310 
311     public void setBoringURLSubstrings(List boringURLSubstrings)
312     {
313         this.boringURLSubstrings = boringURLSubstrings;
314     }
315 
316     public boolean isInteresting(URL u)
317     {
318         return matchURL(u, interestingURLSubstrings);
319     }
320 
321     public boolean isBoring(URL u)
322     {
323         return matchURL(u, boringURLSubstrings);
324     }
325 
326     private boolean matchURL(URL u, List substrings)
327     {
328         String str = u.toExternalForm();
329         for(Iterator i = substrings.iterator(); i.hasNext(); )
330         {
331             String substr = (String) i.next();
332             if(str.indexOf(substr) != -1)
333             {
334                 return true;
335             }
336         }
337         return false;
338     }
339 
340     public void setDepthFirstSearch(boolean depthFirst)
341     {
342         this.depthFirst = depthFirst;
343     }
344 
345     public boolean isDepthFirstSearch()
346     {
347         return depthFirst;
348     }
349 
350     public void setMaxDepth(int maxDepth)
351     {
352         this.maxDepth = maxDepth;
353     }
354 
355     public int getMaxDepth()
356     {
357         return maxDepth;
358     }
359 
360     public void setUserAgent(String userAgent)
361     {
362         this.userAgent = userAgent;
363     }
364 
365     public String getUserAgent()
366     {
367         return userAgent;
368     }
369 
370     public void setBasicAuthUser(String basicAuthUser)
371     {
372         this.basicAuthUser = basicAuthUser;
373     }
374 
375     public String getBasicAuthUser()
376     {
377         return basicAuthUser;
378     }
379 
380     public void setBasicAuthPassword(String basicAuthPassword)
381     {
382         this.basicAuthPassword = basicAuthPassword;
383     }
384 
385     public String getBasicAuthPassword()
386     {
387         return basicAuthPassword;
388     }
389 
390     public void setSpiderThreads(int spiderThreads)
391     {
392         this.spiderThreads = spiderThreads;
393     }
394 
395     public int getSpiderThreads()
396     {
397         return spiderThreads;
398     }
399 
400     public void setCheckpointInterval(long interval)
401     {
402         this.checkpointInterval = interval;
403     }
404 
405     public long getCheckpointInterval()
406     {
407         return checkpointInterval;
408     }
409 
410     public String toString()
411     {
412         return "depthFirst:\t" + depthFirst
413            + "\nmaxDepth:\t" + maxDepth
414            + "\nhtmlExtensions:\t" + fromSet(htmlExtensions)
415            + "\nimageExtensions:\t" + fromSet(imageExtensions)
416            + "\nrefreshHTMLs:\t" + refreshHTMLs
417            + "\nrefreshImages:\t" + refreshImages
418            + "\nrefreshOthers:\t" + refreshOthers
419            + "\nsaveRootDirectory:\t" + saveRootDirectory
420            + "\nstartLocation:\t" + startLocation
421            + "\nurlMatch:\t" + urlMatch
422            + "\nuserAgent:\t" + userAgent
423            + "\nbasicAuthUser:\t" + basicAuthUser
424            + "\nbasicAuthPassword:\t" + "***"
425            + "\nspiderThreads:\t" + spiderThreads
426            + "\ncheckpointInterval:\t" + checkpointInterval;
427     }
428 
429     private Set parseSet(String str)
430     {
431         _logClass.debug("parseSet(" + str + ")");
432         HashSet result = new HashSet();
433         StringTokenizer sTok = new StringTokenizer(str, ",");
434         while(sTok.hasMoreTokens())
435         {
436             String tok = sTok.nextToken().trim();
437             result.add(tok);
438         }
439         return result;
440     }
441 
442     private String fromSet(Set s)
443     {
444         StringBuffer sb = new StringBuffer();
445         boolean first = true;
446         for(Iterator i = s.iterator(); i.hasNext(); )
447         {
448             String str = (String) i.next();
449             if(first)
450             {
451                 first = false;
452             }
453             else
454             {
455                 sb.append(",");
456             }
457             sb.append(str);
458         }
459         return sb.toString();
460     }
461 
462 } // End class SpiderConfig
463