#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Downloads web pages with fillable forms after parsing through a set of links.

Used for collecting web pages with forms. Used as a standalone script.
This script assumes that it is run from the directory in which it is checked
in. If the script is run from elsewhere, the path for REGISTER_PAGE_DIR needs
to be changed.

This script assumes that third party modules are installed:
httplib2, lxml, pycurl.

Usage: webforms_aggregator.py [options] [single url or file containing urls]

Options:
  -l LOG_LEVEL, --log_level LOG_LEVEL
    LOG_LEVEL: debug, info, warning or error [default: error]
  -h, --help  show this help message and exit
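
Example (urls.txt stands for any text file with one URL per line):
  python webforms_aggregator.py --log_level info urls.txt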
22"""
23
24import datetime
25import errno
26import logging
[email protected]732dd43b2011-06-29 18:51:5927import optparse
[email protected]5883af982011-04-29 19:15:5328import os
29import re
[email protected]732dd43b2011-06-29 18:51:5930# Needed in Linux so that PyCurl does not throw a segmentation fault.
31import signal
[email protected]5883af982011-04-29 19:15:5332import sys
33import tempfile
34import threading
35import time
[email protected]732dd43b2011-06-29 18:51:5936import urlparse
[email protected]5883af982011-04-29 19:15:5337
[email protected]732dd43b2011-06-29 18:51:5938import httplib2
[email protected]5883af982011-04-29 19:15:5339from lxml import html, etree
40import pycurl
41
REGISTER_PAGE_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                 'heuristics', 'input')
NOT_FOUND_REG_PAGE_SITES_FILENAME = 'notFoundRegPageSites.txt'

FORM_LOCATION_COMMENT = 'Form Location: %s'
HTML_FILE_PREFIX = 'grabber-'

MAX_REDIRECTIONS = 10

# Strings in a webpage that are indicative of a registration link.
LINK_CLUES = ['regist', 'user', 'sign', 'login', 'account']

MAX_SAME_DOMAIN_URLS_NO = 30
MAX_TOTAL_URLS_PER_DOMAIN = 300
MAX_OPEN_FILES_NO = 500

# URLs are selected for downloading with the following rules from the link
# lists, giving more weight to the links that contain a link clue.
CLUE_SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
CLUE_GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10
GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10
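# With MAX_SAME_DOMAIN_URLS_NO = 30 the quotas above work out to
# 9 + 9 + 6 + 6 = 30 concurrent URLs per domain (integer division).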

MAX_ALLOWED_THREADS = MAX_OPEN_FILES_NO / MAX_SAME_DOMAIN_URLS_NO + 1
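# Integer division: 500 / 30 + 1 = 17 concurrent crawler threads.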


class Retriever(object):
  """Download, parse, and check if the web page contains a registration form.

  The objects of this class have a one-to-one relation with the web pages. For
  each page that is downloaded and parsed, an object of this class is created.
  Each Retriever object creates a curl object. This object is added to the
  curl multi object of the crawler object so that the corresponding page gets
  downloaded.
  """
  logger = logging.getLogger(__name__)

  def __init__(self, url, domain, cookie_file):
    """Initializes a Retriever object.

    Args:
      url: url to download page from.
      domain: only links with this domain will be retrieved.
      cookie_file: the name of a cookie file, needed for pages that use session
          cookies to change their contents.
    """
    self._url = url
    self._domain = domain
    self._html_content = ''

    # Http links without clues from LINK_CLUES.
    self._general_links = []
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Https links that do not contain any clues from LINK_CLUES.
    self._secure_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    self._cookie_file = cookie_file
    self._curl_object = None

  def __del__(self):
    """Cleans up before this object is destroyed.

    The function closes the corresponding curl object that does the
    downloading.
    """
    if self._curl_object:
      self._curl_object.close()

  def _AddLink(self, link):
    """Adds url |link|, if not already present, to the appropriate list.

    The link only gets added to the single list that is appropriate for it:
    _secure_links, _general_links, _clues_secure_links or _clues_general_links.

    Args:
      link: the url that is inserted to the appropriate links list.
    """
    # Handles sites with unicode URLs.
    if isinstance(link, unicode):
      # Encode in 'utf-8' to avoid the UnicodeEncodeError exception.
      link = httplib2.iri2uri(link).encode('utf-8')
    link_parsed = urlparse.urlparse(link)
    link_lists = [self._clues_secure_links, self._secure_links,
                  self._clues_general_links, self._general_links]
    # Checks that the registration page is within the domain.
    if (self._domain in link_parsed[1] and
        all(link not in x for x in link_lists)):
      for clue in LINK_CLUES:
        if clue in link.lower():
          if link_parsed[0].startswith('https'):
            self._clues_secure_links.append(link)
            return
          else:
            self._clues_general_links.append(link)
            return
      if link_parsed[0].startswith('https'):  # No clues found in the link.
        self._secure_links.append(link)
      else:
        self._general_links.append(link)

  def ParseAndGetLinks(self):
    """Parses the downloaded page and gets its links if it's not a reg page.

    Checks if the current page contains a registration form and, if not, gets
    the url links. If it is a registration page, it saves it in a file as
    'grabber-' + domain + '.html' after it has added the FORM_LOCATION_COMMENT
    and it returns True. Otherwise it returns False.

    Returns:
      True if current page contains a registration form, and False otherwise.

    Raises:
      IOError: When the file cannot be written.
    """
    if not self._domain:
      self.logger.error('Error: self._domain was not set')
      sys.exit(1)
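    # Find quoted absolute or protocol-relative URLs in the raw HTML, e.g.
    # "https://example.com/signup" or '//example.com/login'.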
    match_list = re.findall(r'(?P<quote>[\'\"])(?P<link>(?:https?:)?//.*?)\1',
                            self._html_content)
    for group_list in match_list:
      link = group_list[1]
      if link.startswith('//'):
        link = urlparse.urljoin(self._url, link)
      self._AddLink(link)
    try:
      tree = html.fromstring(self._html_content, parser=html.HTMLParser())
    except etree.LxmlError:
      self.logger.info('\t\tSkipping: not valid HTML code in this page <<< %s',
                       self._url)
      return False
    try:
      body = tree.iter('body').next()
    except StopIteration:
      self.logger.info('\t\tSkipping: no "BODY" tag in this page <<< %s',
                       self._url)
      return False

    # Get a list of all input elements with attribute type='password'.
    password_elements = list(body.iterfind('.//input[@type="password"]'))
    # Check for multiple password elements to distinguish between a login form
    # and a registration form (Password field and Confirm Password field).
    if password_elements and len(password_elements) >= 2:
      form_elements = []
      for password_elem in password_elements:
        form_elem = password_elem.xpath('ancestor::form[1]')
        if not form_elem:
          continue
        if not form_elem[0] in form_elements:
          form_elements.append(form_elem[0])
        else:
          # Confirms that the page contains a registration form if two
          # passwords are contained in the same form for form_elem[0].
          if not os.path.isdir(REGISTER_PAGE_DIR):
            os.makedirs(REGISTER_PAGE_DIR)
          # Locate the HTML tag and insert the form location comment after it.
          html_tag = tree.iter('html').next()
          comment = etree.Comment(FORM_LOCATION_COMMENT % self._url)
          html_tag.insert(0, comment)
          # Create a new file and save the HTML registration page code.
          f = open('%s/%s%s.html' % (REGISTER_PAGE_DIR, HTML_FILE_PREFIX,
                                     self._domain), 'w')
          try:
            f.write(html.tostring(tree, pretty_print=True))
          except IOError as e:
            self.logger.error('Error: %s', e)
            raise
          finally:
            f.close()
          return True  # Registration page found.
    # Indicates page is not a registration page and links must be parsed.
    link_elements = list(body.iter('a'))
    for link_elem in link_elements:
      link = link_elem.get('href')
      if not link or '#' == link[0]:
        continue
      link = urlparse.urljoin(self._url, link)
      link_parsed = urlparse.urlparse(link)
      if not link_parsed[0].startswith('http'):
        continue
      self._AddLink(link)
    return False  # Registration page not found.

  def InitRequestHead(self):
    """Initializes the curl object for a HEAD request.

    A HEAD request is initiated so that we can check from the headers if this
    is a valid HTML file. If it is not a valid HTML file, then we do not
    initiate a GET request, which avoids an unnecessary download.
    """
    self._curl_object = pycurl.Curl()
    self._curl_object.setopt(pycurl.URL, self._url)
    # The following line fixes the GnuTLS package error that pycurl depends
    # on for getting https pages.
    self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
    self._curl_object.setopt(pycurl.FOLLOWLOCATION, True)
    self._curl_object.setopt(pycurl.NOBODY, True)
    self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False)
    self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS)
    self._curl_object.setopt(pycurl.FAILONERROR, False)
    self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file)
    self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file)
    self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30)
    self._curl_object.setopt(pycurl.TIMEOUT, 300)
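    # NOSIGNAL keeps libcurl from using signals for timeout handling, which is
    # needed when curl handles are used from multiple threads.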
    self._curl_object.setopt(pycurl.NOSIGNAL, 1)

  def InitRequestGet(self):
    """Initializes the curl object for a GET request.

    This is called only for valid HTML files. Pycurl makes a GET request and
    the page begins to download, but not all of the page's data arrives at
    once. Whenever some data is downloaded, pycurl puts it in the buffer, and
    the data is appended to the end of the page until everything is
    downloaded.
    """
    self._curl_object.setopt(pycurl.NOBODY, False)
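    # pycurl invokes WRITEFUNCTION with each received chunk; append it to the
    # page content accumulated so far.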
    self._curl_object.setopt(
        pycurl.WRITEFUNCTION, lambda buff: setattr(
            self, '_html_content', self._html_content + buff))

  def Download(self):
    """Downloads the self._url page.

    It first does a HEAD request and then it proceeds to a GET request.
    It uses a curl object for a single download. This function is called only
    once for the initial url of a site when we still don't have more urls from
    a domain.

    Returns:
      True, if the downloaded page is valid HTML code, or False otherwise.
    """
    self.InitRequestHead()
    try:
      self._curl_object.perform()
    except pycurl.error as e:
      self.logger.error('Error: %s, url: %s', e, self._url)
      return False
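    # EFFECTIVE_URL is the final URL after any redirects; use it from here on
    # so that relative links resolve against the right base.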
    self._url = urlparse.urljoin(
        self._url, self._curl_object.getinfo(pycurl.EFFECTIVE_URL))
    content_type = self._curl_object.getinfo(pycurl.CONTENT_TYPE)
    if content_type and ('text/html' in content_type.lower()):
      self.InitRequestGet()
      try:
        self._curl_object.perform()
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
        return False
      return True
    else:
      self.logger.info('\tSkipping: Not an HTML page <<< %s', self._url)
      return False

  def Run(self):
    """Called only once for the initial url when we do not have more urls.

    Downloads the originally-specified site url, parses it and gets the links.

    Returns:
      True, if a registration page is found, and False otherwise.
    """
    if self.Download():
      if not self._domain:
        url_parsed = urlparse.urlparse(self._url)
        self._domain = url_parsed[1]
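        # Strip a leading 'www' label, e.g. 'www.example.com' becomes
        # 'example.com'.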
        if self._domain.startswith('www'):
          self._domain = '.'.join(self._domain.split('.')[1:])
      if self.ParseAndGetLinks():
        return True
    return False


class Crawler(object):
  """Crawls a site until a registration page is found or max level is reached.

  Creates, uses and destroys Retriever objects. Creates a cookie temp file
  needed for session cookies. It keeps track of 'visited links' and
  'links to visit' of the site. To do this it uses the links discovered from
  each Retriever object. Use Run() to crawl the site.
  """
  # Ignore SIGPIPE so that writing to a closed connection does not kill the
  # process. SIGPIPE does not exist on Windows, hence the AttributeError.
  try:
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
  except AttributeError:
    pass
  logger = logging.getLogger(__name__)

  def __init__(self, url, logging_level=None):
    """Inits URL, link lists and logger, and creates a cookie temp file.

    The cookie temp file is needed for session cookies.

    Args:
      url: the initial "seed" url of the site.
      logging_level: the desired verbosity level, default is None.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self.url_error = False
    url_parsed = urlparse.urlparse(url)
    if not url_parsed[0].startswith('http'):
      self.logger.error(
          'Error: "%s" does not begin with http:// or https://', url)
      self.url_error = True
      return
    # Example: if url is 'http://www.example.com?name=john' then value [1],
    # the network location, is 'www.example.com'.
    if not url_parsed[1]:
      self.logger.error('Error: "%s" is not a valid url', url)
      self.url_error = True
      return
    self._url = url
    self._domain = ''
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Http links that do not contain any clue from LINK_CLUES.
    self._general_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    # Https links that do not contain any clue from LINK_CLUES.
    self._secure_links = []
    # All links downloaded and parsed so far.
    self._links_visited = []
    self._retrievers_list = []
    self._cookie_file = tempfile.NamedTemporaryFile(
        suffix='.cookie', delete=False)
    self._cookie_file.close()
    self._cookie_file = self._cookie_file.name  # Keep only the filename.

  def __del__(self):
    """Deletes cookie file when Crawler instances are destroyed."""
    if hasattr(self, '_cookie_file'):
      self.logger.info('Deleting cookie file %s ...', self._cookie_file)
      os.unlink(self._cookie_file)

  def _MultiPerform(self, curl_multi_object):
    """Performs concurrent downloads using a CurlMulti object.

    Args:
      curl_multi_object: a curl object that downloads multiple pages
          concurrently. The class of this object is |pycurl.CurlMulti|.
    """
    # The following code is based on the CurlMulti example at
    # http://pycurl.sourceforge.net/doc/curlmultiobject.html.
    while True:
      ret, no_handles = curl_multi_object.perform()
      if ret != pycurl.E_CALL_MULTI_PERFORM:
        break
    while no_handles:
      curl_multi_object.select(1.0)
      while True:
        ret, no_handles = curl_multi_object.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
          break

  def _GetLinksPages(self, curl_multi_object):
    """Downloads many pages concurrently using a CurlMulti Object.

    Creates many Retriever objects and adds them to a list. The constant
    MAX_SAME_DOMAIN_URLS_NO defines the number of pages that can be downloaded
    concurrently from the same domain using the pycurl multi object. It's
    currently set to 30 URLs. These URLs are taken from the links lists, which
    are csl, cgl, sl, and gl. The rules define how many URLs are taken from
    each list during each iteration.

    Example of the rules:
      3/10 from csl results in 9 URLs
      3/10 from cgl results in 9 URLs
      2/10 from sl results in 6 URLs
      2/10 from gl results in 6 URLs

    Adding up the above URLs gives 30 URLs that can be downloaded concurrently.
    If these lists have fewer items than the defined rules, such as when a site
    does not contain any secure links, then the csl and sl lists will have
    length 0 and only 15 pages would be downloaded concurrently from the same
    domain.

    Since 30 URLs can be handled concurrently, the number of links taken from
    the other lists can be increased. This means that we can take 24 links
    from the cgl list so that 24 from cgl + 6 from gl = 30 URLs. If the cgl
    list has fewer than 24 links, e.g. only 21 links, then up to 9 links may
    be taken from gl so that 0 + 21 + 0 + 9 = 30.

    Args:
      curl_multi_object: Each Retriever object has a curl object which is
          added to the CurlMulti Object.
    """
    self._retrievers_list = []

    csl_no = min(CLUE_SECURE_LINKS_NO, len(self._clues_secure_links))
    cgl_no = min(CLUE_GENERAL_LINKS_NO, len(self._clues_general_links))
    sl_no = min(SECURE_LINKS_NO, len(self._secure_links))
    gl_no = min(GENERAL_LINKS_NO, len(self._general_links))

    # If some lists have fewer items than needed, the missing links will be
    # taken from the other lists, with the following priority: csl, cgl, sl,
    # gl. (c: clues, s: secure, g: general, l: list.)
    spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      csl_no = min(csl_no + spare_links, len(self._clues_secure_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      cgl_no = min(cgl_no + spare_links, len(self._clues_general_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      sl_no = min(sl_no + spare_links, len(self._secure_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      gl_no = min(gl_no + spare_links, len(self._general_links))

    for no_of_links, links in [
        (csl_no, self._clues_secure_links),
        (sl_no, self._secure_links),
        (cgl_no, self._clues_general_links),
        (gl_no, self._general_links)]:
      for i in xrange(no_of_links):
        if not links:
          break
        url = links.pop(0)
        self._links_visited.append(url)
        r = Retriever(url, self._domain, self._cookie_file)
        r.InitRequestHead()
        curl_multi_object.add_handle(r._curl_object)
        self._retrievers_list.append(r)

    if self._retrievers_list:
      try:
        self._MultiPerform(curl_multi_object)
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
      finally:
        for r in self._retrievers_list:
          curl_multi_object.remove_handle(r._curl_object)
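      # The HEAD requests above established each URL's content type. Re-add
      # GET handles only for the URLs that turned out to be HTML.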
      # |_retrievers_list[:]| is a copy of |_retrievers_list| to avoid removing
      # items from the iterated list.
      for r in self._retrievers_list[:]:
        r._url = urlparse.urljoin(r._url, r._curl_object.getinfo(
            pycurl.EFFECTIVE_URL))
        content_type = r._curl_object.getinfo(pycurl.CONTENT_TYPE)
        if content_type and ('text/html' in content_type.lower()):
          r.InitRequestGet()
          curl_multi_object.add_handle(r._curl_object)
        else:
          self._retrievers_list.remove(r)
          self.logger.info('\tSkipping: Not an HTML page <<< %s', r._url)
      if self._retrievers_list:
        try:
          self._MultiPerform(curl_multi_object)
        except pycurl.error as e:
          self.logger.error('Error: %s, url: %s', e, self._url)
        finally:
          for r in self._retrievers_list:
            curl_multi_object.remove_handle(r._curl_object)
            self.logger.info('Downloaded: %s', r._url)

  def _LogRegPageFound(self, retriever):
    """Displays logging for a found registration page.

    Args:
      retriever: The object that has retrieved the page.
    """
    self.logger.info('\t##############################################')
    self.logger.info('\t### %s ###', retriever._domain)
    self.logger.info('\t##############################################')
    self.logger.info('\t!!!!!!!!! registration page FOUND !!!!!!!!!!!')
    self.logger.info('\t%s', retriever._url)
    self.logger.info('\t##############################################')

  def _GetNewLinks(self, retriever):
    """Appends new links discovered by each retriever to the appropriate lists.

    Links are copied to the links lists of the crawler object, which hold all
    the links found from all retrievers that the crawler object created. The
    Crawler object exists as long as a specific site is examined and the
    Retriever object exists as long as a page of this site is examined.

    Args:
      retriever: a temporary object that downloads a specific page, parses the
          content and gets the page's href links.
    """
    for link in retriever._clues_secure_links:
      if (not link in self._clues_secure_links and
          not link in self._links_visited):
        self._clues_secure_links.append(link)
    for link in retriever._secure_links:
      if (not link in self._secure_links and
          not link in self._links_visited):
        self._secure_links.append(link)
    for link in retriever._clues_general_links:
      if (not link in self._clues_general_links and
          not link in self._links_visited):
        self._clues_general_links.append(link)
    for link in retriever._general_links:
      if (not link in self._general_links and
          not link in self._links_visited):
        self._general_links.append(link)

  def Run(self):
    """Runs the Crawler.

    Creates a Retriever object and calls its Run method to get the first links,
    and then uses a CurlMulti object and creates many Retriever objects to get
    the subsequent pages.

    The number of pages (= Retriever objects) created each time is restricted
    by MAX_SAME_DOMAIN_URLS_NO. After this number of Retriever objects download
    and parse their pages, we do the same again. The total number of pages
    visited is kept in urls_visited.
    If no registration page is found, the Crawler object gives up after
    MAX_TOTAL_URLS_PER_DOMAIN pages have been visited.

    Returns:
      True if a registration page is found, and False otherwise.
    """
    reg_page_found = False
    if self.url_error:
      return False
    r = Retriever(self._url, self._domain, self._cookie_file)
    if r.Run():
      self._LogRegPageFound(r)
      reg_page_found = True
    else:
      self._url = r._url
      self._domain = r._domain
      self.logger.info('url to crawl: %s', self._url)
      self.logger.info('domain: %s', self._domain)
      self._links_visited.append(r._url)
      self._GetNewLinks(r)
      urls_visited = 1
      while True:
        if (not (self._clues_secure_links or self._secure_links or
                 self._clues_general_links or self._general_links) or
            urls_visited >= MAX_TOTAL_URLS_PER_DOMAIN):
          break  # Registration page not found.
        m = pycurl.CurlMulti()
        self._GetLinksPages(m)
        urls_visited += len(self._retrievers_list)
        self.logger.info('\t<----- URLs visited for domain "%s": %d ----->',
                         self._domain, urls_visited)
        for r in self._retrievers_list:
          if r.ParseAndGetLinks():
            self._LogRegPageFound(r)
            reg_page_found = True
            break
          else:
            self.logger.info('parsed: %s', r._url)
            self._GetNewLinks(r)
        m.close()
        if reg_page_found:
          break
    while self._retrievers_list:
      r = self._retrievers_list.pop()
    return reg_page_found


class WorkerThread(threading.Thread):
  """Creates a new thread of execution."""

  def __init__(self, url):
    """Creates the _url and page_found attributes.

    Used after the thread's termination to populate the file listing the urls
    for which a registration page wasn't found.

    Args:
      url: will be used as an argument to create a Crawler object later.
    """
    threading.Thread.__init__(self)
    self._url = url
    self.page_found = False

  def run(self):
    """Execution of the thread creates a Crawler object and runs it.

    Caution: this function name should not be changed to 'Run' or any other
    name, because it overrides the 'run' method of the 'threading.Thread'
    class. Otherwise it will never be called.
    """
    self.page_found = Crawler(self._url).Run()


class ThreadedCrawler(object):
  """Calls the Run function of WorkerThread, which creates & runs a Crawler.

  The Crawler objects run concurrently, examining one site each.
  """
  logger = logging.getLogger(__name__)

  def __init__(self, urls_file, logging_level=None):
    """Creates threaded Crawler objects.

    Args:
      urls_file: a text file containing a URL on each line.
      logging_level: verbosity level, default is None.

    Raises:
      IOError: If no URLs can be found in the file.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self._urls_list = []
    f = open(urls_file)
    try:
      for url in f.readlines():
        url = url.strip()
        if not urlparse.urlparse(url)[0].startswith('http'):
          self.logger.info(
              '%s: skipping this (does not begin with "http://")', url)
          continue
        self._urls_list.append(url)
    except IOError as e:
      self.logger.error('Error: %s', e)
      raise
    finally:
      f.close()
    if not self._urls_list:
      error_msg = 'No URLs were found.'
      self.logger.error('ERROR: %s', error_msg)
      raise IOError(error_msg)

  def Run(self):
    """Runs Crawler objects using python threads.

    The number of concurrent threads is restricted to MAX_ALLOWED_THREADS.

    Returns:
      The number of registration pages found. -1 if no URLs are given.

    Raises:
      OSError: If the output directory cannot be created.
    """
    if self._urls_list:
      allThreads = []
      # originalNumThreads is the number of threads just before the
      # ThreadedCrawler starts creating new threads. As a standalone script it
      # will be 1.
      originalNumThreads = threading.active_count()
      for url in self._urls_list:
        self.logger.info('URL fed to a crawler thread: %s', url)
        t = WorkerThread(url)
        t.start()
        allThreads.append(t)
        while threading.active_count() >= (
            MAX_ALLOWED_THREADS + originalNumThreads):
          time.sleep(.4)
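      # All URLs have been dispatched; wait for the remaining worker threads
      # to finish before reporting the results.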
      while threading.active_count() > originalNumThreads:
        time.sleep(.4)
      self.logger.info('----------------')
      self.logger.info('--- FINISHED ---')
      self.logger.info('----------------')
      urls_no = 0
      urls_not_found_no = 0
      not_file_name = os.path.join(
          REGISTER_PAGE_DIR, NOT_FOUND_REG_PAGE_SITES_FILENAME)
      not_file_dir = os.path.dirname(not_file_name)
      try:
        os.makedirs(not_file_dir)
      except OSError as e:
        if e.errno != errno.EEXIST:
          raise
      fnot = open(not_file_name, 'wb')
      try:
        for t in sorted(allThreads, key=lambda t: t._url):
          urls_no += 1
          if not t.page_found:
            urls_not_found_no += 1
            fnot.write('%s' % t._url)
            fnot.write(os.linesep)
      except IOError as e:
        self.logger.error('Error: %s', e)
      finally:
        fnot.close()
      self.logger.info('Total number of URLs given: %d\n', urls_no)
      self.logger.info(
          'Registration pages found: %d\n', (urls_no - urls_not_found_no))
      self.logger.info(
          'URLs that did not return a registration page: %d\n',
          urls_not_found_no)
      return urls_no - urls_not_found_no
    else:
      self.logger.error('Error: no URLs were found.')
      return -1


def main():
  usage = 'usage: %prog [options] single_url_or_urls_filename'
  parser = optparse.OptionParser(usage)
  parser.add_option(
      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
      help='LOG_LEVEL: debug, info, warning or error [default: %default]')

  (options, args) = parser.parse_args()
  options.log_level = options.log_level.upper()
  if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
    print 'Wrong log_level argument.'
    parser.print_help()
    return 1
  options.log_level = getattr(logging, options.log_level)

  if len(args) != 1:
    parser.error('Wrong number of arguments.')

  logger = logging.getLogger(__name__)
  if options.log_level:
    console = logging.StreamHandler()
    logger.addHandler(console)
    logger.setLevel(options.log_level)

  arg_is_a_file = os.path.isfile(args[0])
  if arg_is_a_file:
    CrawlerClass = ThreadedCrawler
  else:
    CrawlerClass = Crawler
  t0 = datetime.datetime.now()
  c = CrawlerClass(args[0], options.log_level)
  c.Run()
  if not arg_is_a_file and c.url_error:
    logger.error(
        'ERROR: "%s" is neither a valid filename nor a valid URL' % args[0])
  t1 = datetime.datetime.now()
  delta_t = t1 - t0
  logger.info('Started at: %s\n', t0)
  logger.info('Ended at: %s\n', t1)
  logger.info('Total execution time: %s\n', delta_t)
  return 0


if __name__ == "__main__":
  sys.exit(main())