1
24
25
27 package weblech.spider;
28
29 import weblech.util.Logger;
30 import weblech.util.Log4j;
31
32 import java.util.*;
33 import java.io.*;
34 import java.net.URL;
35
36 import org.apache.log4j.Category;
37
38 public class Spider extends Logger implements Runnable, Constants
39 {
40
41 private SpiderConfig config;
42
46 private DownloadQueue queue;
47
52 private Set urlsDownloadedOrScheduled;
53
57 private Set urlsDownloading;
58
63 private int downloadsInProgress;
64
65 private boolean quit;
66
67 private int running;
68
69 private long lastCheckpoint;
70
71 public Spider(SpiderConfig config)
72 {
73 this.config = config;
74 queue = new DownloadQueue(config);
75 queue.queueURL(new URLToDownload(config.getStartLocation(), 0));
76 urlsDownloadedOrScheduled = new HashSet();
77 urlsDownloading = new HashSet();
78 downloadsInProgress = 0;
79 lastCheckpoint = 0;
80 }
81
82 public void start()
83 {
84 quit = false;
85 running = 0;
86
87 for(int i = 0; i < config.getSpiderThreads(); i++)
88 {
89 _logClass.info("Starting Spider thread");
90 Thread t = new Thread(this, "Spider-Thread-" + (i + 1));
91 t.start();
92 running++;
93 }
94 }
95
96 public void stop()
97 {
98 quit = true;
99 }
100
101 public boolean isRunning()
102 {
103 return running == 0;
104 }
105
106 private void checkpointIfNeeded()
107 {
108 if(config.getCheckpointInterval() == 0)
109 {
110 return;
111 }
112
113 if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
114 {
115 synchronized(queue)
116 {
117 if(System.currentTimeMillis() - lastCheckpoint > config.getCheckpointInterval())
118 {
119 writeCheckpoint();
120 lastCheckpoint = System.currentTimeMillis();
121 }
122 }
123 }
124 }
125
126 private void writeCheckpoint()
127 {
128 _logClass.debug("writeCheckpoint()");
129 try
130 {
131 FileOutputStream fos = new FileOutputStream("spider.checkpoint", false);
132 ObjectOutputStream oos = new ObjectOutputStream(fos);
133 oos.writeObject(queue);
134 oos.writeObject(urlsDownloading);
135 oos.close();
136 }
137 catch(IOException ioe)
138 {
139 _logClass.warn("IO Exception attempting checkpoint: " + ioe.getMessage(), ioe);
140 }
141 }
142
143 public void readCheckpoint()
144 {
145 try
146 {
147 FileInputStream fis = new FileInputStream("spider.checkpoint");
148 ObjectInputStream ois = new ObjectInputStream(fis);
149 queue = (DownloadQueue) ois.readObject();
150 urlsDownloading = (Set) ois.readObject();
151 queue.queueURLs(urlsDownloading);
152 urlsDownloading.clear();
153 }
154 catch(Exception e)
155 {
156 _logClass.error("Caught exception reading checkpoint: " + e.getMessage(), e);
157 }
158 }
159
160 public void run()
161 {
162 HTMLParser htmlParser = new HTMLParser(config);
163 URLGetter urlGetter = new URLGetter(config);
164
165 System.err.println("queueSize = "+queueSize());
166 while((queueSize() > 0 || downloadsInProgress > 0) && quit == false)
167 {
168 checkpointIfNeeded();
169 if(queueSize() == 0 && downloadsInProgress > 0)
170 {
171 try
173 {
174 Thread.sleep(QUEUE_CHECK_INTERVAL);
175 }
176 catch(InterruptedException ignored)
177 {
178 }
179 continue;
181 }
182 else if(queueSize() == 0)
183 {
184 break;
185 }
186 URLToDownload nextURL;
187 synchronized(queue)
188 {
189 System.err.println("queueSize = "+queueSize());
190 if (queueSize() != 0) {
191 nextURL = queue.getNextInQueue();
192 downloadsInProgress++;
193 } else
194 break;
195 }
196 synchronized(urlsDownloading)
197 {
198 urlsDownloading.add(nextURL);
199 }
200 int newDepth = nextURL.getDepth() + 1;
201 int maxDepth = config.getMaxDepth();
202 synchronized(urlsDownloading)
203 {
204 urlsDownloading.remove(nextURL);
205 }
206 List newURLs = downloadURL(nextURL, urlGetter, htmlParser);
207
208 newURLs = filterURLs(newURLs);
209
210 ArrayList u2dsToQueue = new ArrayList();
211 for(Iterator i = newURLs.iterator(); i.hasNext(); )
212 {
213 URL u = (URL) i.next();
214 synchronized(urlsDownloadedOrScheduled)
216 {
217 if(!urlsDownloadedOrScheduled.contains(u)
218 && (maxDepth == 0 || newDepth <= maxDepth))
219 {
220 u2dsToQueue.add(new URLToDownload(u, nextURL.getURL(), newDepth));
221 urlsDownloadedOrScheduled.add(u);
222 }
223 }
224 }
225 synchronized(queue)
226 {
227 queue.queueURLs(u2dsToQueue);
228 downloadsInProgress--;
229 }
230 }
231 _logClass.info("Spider thread stopping");
232 running--;
233 }
234
235
238 private int queueSize()
239 {
240 synchronized(queue)
241 {
242 return queue.size();
243 }
244 }
245
246
251 private List downloadURL(URLToDownload url, URLGetter urlGetter, HTMLParser htmlParser)
252 {
253 _logClass.debug("downloadURL(" + url + ")");
254
255 URLObject obj = new URLObject(url.getURL(), config);
257 if(obj.existsOnDisk())
258 {
259 if(config.refreshHTMLs() && (obj.isHTML() || obj.isXML()))
260 {
261 _logClass.info("Q: [" + queue + "] " + url);
262 obj = urlGetter.getURL(url);
263 }
264 else if(config.refreshImages() && obj.isImage())
265 {
266 _logClass.info("Q: [" + queue + "] " + url);
267 obj = urlGetter.getURL(url);
268 }
269 }
270 else
271 {
272 _logClass.info("Q: [" + queue + "] " + url);
273 obj = urlGetter.getURL(url);
274 }
275
276 if(obj == null)
277 {
278 return new ArrayList();
279 }
280
281 if(!obj.existsOnDisk())
282 {
283 obj.writeToFile();
284 }
285
286 if(obj.isHTML() || obj.isXML())
287 {
288 return htmlParser.parseLinksInDocument(url.getURL(), obj.getStringContent());
289 }
290 else if(obj.isImage())
291 {
292 return new ArrayList();
293 }
294 else
295 {
296 _logClass.warn("Unsupported content type received: " + obj.getContentType());
297 _logClass.info("URL was " + url);
298 return new ArrayList();
299 }
300 }
301
302 private List filterURLs(List URLs)
303 {
304 String match = config.getURLMatch();
305 ArrayList retVal = new ArrayList();
306
307 synchronized(urlsDownloadedOrScheduled)
308 {
309 for(Iterator i = URLs.iterator(); i.hasNext(); )
310 {
311 URL u = (URL) i.next();
312 if(urlsDownloadedOrScheduled.contains(u))
313 {
314 continue;
315 }
316
317 String s = u.toExternalForm();
318 if(s.indexOf(match) != -1)
319 {
320 retVal.add(u);
321 }
322 }
323 }
324 return retVal;
325 }
326
327 }
328