1   /*
2    * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
3    *
4    * Copyright (c) 2001 Brian Pitcher
5    *
6    * Permission is hereby granted, free of charge, to any person obtaining a
7    * copy of this software and associated documentation files (the "Software"),
8    * to deal in the Software without restriction, including without limitation
9    * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10   * and/or sell copies of the Software, and to permit persons to whom the
11   * Software is furnished to do so, subject to the following conditions:
12   *
13   * The above copyright notice and this permission notice shall be included in
14   * all copies or substantial portions of the Software.
15   *
16   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22   * SOFTWARE.
23   */
24  
25  // $Header: /cvsroot/weblech/weblech/src/weblech/spider/Spider.java,v 1.8 2002/06/09 11:34:38 weblech Exp $
26  
27  package weblech.spider;
28  
29  import weblech.util.Logger;
30  import weblech.util.Log4j;
31  
32  import java.util.*;
33  import java.io.*;
34  import java.net.URL;
35  
36  import org.apache.log4j.Category;
37  
38  public class Spider extends Logger implements Runnable, Constants
39  {
40      /** Config for the spider */
41      private SpiderConfig config;
42      /**
43       * Download queue.
44       * Thread safety: To access the queue, first synchronize on it.
45       */
46      private DownloadQueue queue;
47      /**
48       * Set of URLs downloaded or scheduled, so we don't download a
49       * URL more than once.
50       * Thread safety: To access the set, first synchronize on it.
51       */
52      private Set urlsDownloadedOrScheduled;
53      /**
54       * Set of URLs currently being downloaded by Spider threads.
55       * Thread safety: To access the set, first synchronize on it.
56       */
57      private Set urlsDownloading;
58      /**
59       * Number of downloads currently taking place.
60       * Thread safety: To modify this value, first synchronize on
61       *                the download queue.
62       */
63      private int downloadsInProgress;
64      /** Whether the spider should quit */
65      private boolean quit;
66      /** Count of running Spider threads. */
67      private int running;
68      /** Time we last checkpointed. */
69      private long lastCheckpoint;
70  
71      public Spider(SpiderConfig config)
72      {
73          this.config = config;
74          queue = new DownloadQueue(config);
75          queue.queueURL(new URLToDownload(config.getStartLocation(), 0));
76          urlsDownloadedOrScheduled = new HashSet();
77          urlsDownloading = new HashSet();
78          downloadsInProgress = 0;
79          lastCheckpoint = 0;
80      }
81  
82      public void start()
83      {
84          quit = false;
85          running = 0;
86  
87          for(int i = 0; i < config.getSpiderThreads(); i++)
88          {
89              _logClass.info("Starting Spider thread");
90              Thread t = new Thread(this, "Spider-Thread-" + (i + 1));
91              t.start();
92              running++;
93          }
94      }
95  
96      public void stop()
97      {
98          quit = true;
99      }
100 
101     public boolean isRunning()
102     {
103         return running == 0;
104     }
105 
106     private void checkpointIfNeeded()
107     {
108         if(config.getCheckpointInterval() == 0)
109         {
110             return;
111         }
112 
113         if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
114         {
115             synchronized(queue)
116             {
117                 if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
118                 {
119                     writeCheckpoint();
120                     lastCheckpoint = System.currentTimeMillis();
121                 }
122             }
123         }
124     }
125 
126     private void writeCheckpoint()
127     {
128         _logClass.debug("writeCheckpoint()");
129         try
130         {
131             FileOutputStream fos = new FileOutputStream("spider.checkpoint", false);
132             ObjectOutputStream oos = new ObjectOutputStream(fos);
133             oos.writeObject(queue);
134             oos.writeObject(urlsDownloading);
135             oos.close();
136         }
137         catch(IOException ioe)
138         {
139             _logClass.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);
140         }
141     }
142 
143     public void readCheckpoint()
144     {
145         try
146         {
147             FileInputStream fis = new FileInputStream("spider.checkpoint");
148             ObjectInputStream ois = new ObjectInputStream(fis);
149             queue = (DownloadQueue) ois.readObject();
150             urlsDownloading = (Set) ois.readObject();
151             queue.queueURLs(urlsDownloading);
152             urlsDownloading.clear();
153         }
154         catch(Exception e)
155         {
156             _logClass.error("Caught exception reading checkpoint: " + e.getMessage(), e);
157         }
158     }
159 
160     public void run()
161     {
162         HTMLParser htmlParser = new HTMLParser(config);
163         URLGetter urlGetter = new URLGetter(config);
164 
165         System.err.println("queueSize = "+queueSize());
166         while((queueSize() > 0 || downloadsInProgress > 0) && quit == false)
167         {
168             checkpointIfNeeded();
169             if(queueSize() == 0 && downloadsInProgress > 0)
170             {
171                 // Wait for a download to finish before seeing if this thread should stop
172                 try
173                 {
174                     Thread.sleep(QUEUE_CHECK_INTERVAL);
175                 }
176                 catch(InterruptedException ignored)
177                 {
178                 }
179                 // Have another go at the loop
180                 continue;
181             }
182             else if(queueSize() == 0)
183             {
184                 break;
185             }
186             URLToDownload nextURL;
187             synchronized(queue)
188             {
189                 System.err.println("queueSize = "+queueSize());
190                 if (queueSize() != 0) {
191                         nextURL = queue.getNextInQueue();
192                         downloadsInProgress++;
193                 } else
194                     break;
195             }
196             synchronized(urlsDownloading)
197             {
198                 urlsDownloading.add(nextURL);
199             }
200             int newDepth = nextURL.getDepth() + 1;
201             int maxDepth = config.getMaxDepth();
202             synchronized(urlsDownloading)
203             {
204                 urlsDownloading.remove(nextURL);
205             }
206             List newURLs = downloadURL(nextURL, urlGetter, htmlParser);
207 
208             newURLs = filterURLs(newURLs);
209 
210             ArrayList u2dsToQueue = new ArrayList();
211             for(Iterator i = newURLs.iterator(); i.hasNext(); )
212             {
213                 URL u = (URL) i.next();
214                 // Download if not yet downloaded, and the new depth is less than the maximum
215                 synchronized(urlsDownloadedOrScheduled)
216                 {
217                     if(!urlsDownloadedOrScheduled.contains(u)
218                     && (maxDepth == 0 || newDepth <= maxDepth))
219                     {
220                         u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth));
221                         urlsDownloadedOrScheduled.add(u);
222                     }
223                 }
224             }
225             synchronized(queue)
226             {
227                 queue.queueURLs(u2dsToQueue);
228                 downloadsInProgress--;
229             }
230         }
231         _logClass.info("Spider thread stopping");
232         running--;
233     }
234 
235     /**
236      * Get the size of the download queue in a thread-safe manner.
237      */
238     private int queueSize()
239     {
240         synchronized(queue)
241         {
242             return queue.size();
243         }
244     }
245 
246     /**
247      * Get a URL, and return new URLs that are referenced from it.
248      *
249      * @return A List of URL objects.
250      */
251     private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser)
252     {
253         _logClass.debug("downloadURL(" + url + ")");
254 
255         // Bail out early if image and already on disk
256         URLObject obj = new URLObject(url.getURL(), config);
257         if(obj.existsOnDisk())
258         {
259             if(config.refreshHTMLs() && (obj.isHTML() || obj.isXML()))
260             {
261                 _logClass.info("Q: [" + queue + "] " + url);
262                 obj = urlGetter.getURL(url);
263             }
264             else if(config.refreshImages() && obj.isImage())
265             {
266                 _logClass.info("Q: [" + queue + "] " + url);
267                 obj = urlGetter.getURL(url);
268             }
269         }
270         else
271         {
272             _logClass.info("Q: [" + queue + "] " + url);
273             obj = urlGetter.getURL(url);
274         }
275 
276         if(obj == null)
277         {
278             return new ArrayList();
279         }
280 
281         if(!obj.existsOnDisk())
282         {
283             obj.writeToFile();
284         }
285 
286         if(obj.isHTML() || obj.isXML())
287         {
288             return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent());
289         }
290         else if(obj.isImage())
291         {
292             return new ArrayList();
293         }
294         else
295         {
296             _logClass.warn("Unsupported content type received: " + obj.getContentType());
297             _logClass.info("URL was " + url);
298             return new ArrayList();
299         }
300     }
301 
302     private List filterURLs(List URLs)
303     {
304         String match = config.getURLMatch();
305         ArrayList retVal = new ArrayList();
306 
307         synchronized(urlsDownloadedOrScheduled)
308         {
309             for(Iterator i = URLs.iterator(); i.hasNext(); )
310             {
311                 URL u = (URL) i.next();
312                 if(urlsDownloadedOrScheduled.contains(u))
313                 {
314                     continue;
315                 }
316 
317                 String s = u.toExternalForm();
318                 if(s.indexOf(match) != -1)
319                 {
320                     retVal.add(u);
321                 }
322             }
323         }
324         return retVal;
325     }
326 
327 }
328