#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

| 6 | """Downloads web pages with fillable forms after parsing through a set of links. |
| 7 | |
| 8 | Used for collecting web pages with forms. Used as a standalone script. |
| 9 | This script assumes that it's run from within the same directory in which it's |
| 10 | checked into. If this script were to be run elsewhere then the path for |
| 11 | REGISTER_PAGE_DIR needs to be changed. |
| 12 | |
| 13 | This script assumes that third party modules are installed: |
| 14 | httplib2, lxml, pycurl. |
| 15 | |
| 16 | Usage: webforms_aggregator.py [options] [single url or file containing urls] |
| 17 | |
| 18 | Options: |
| 19 | -l LOG_LEVEL, --log_level LOG_LEVEL |
| 20 | LOG_LEVEL: debug, info, warning or error [default: error] |
| 21 | -h, --help show this help message and exit |
| 22 | """ |

import datetime
import errno
import logging
import optparse
import os
import re
# Needed on Linux so that PyCurl does not throw a segmentation fault.
import signal
import sys
import tempfile
import threading
import time
import urlparse

import httplib2
from lxml import html, etree
import pycurl

REGISTER_PAGE_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                 'heuristics', 'input')
NOT_FOUND_REG_PAGE_SITES_FILENAME = 'notFoundRegPageSites.txt'

FORM_LOCATION_COMMENT = 'Form Location: %s'
HTML_FILE_PREFIX = 'grabber-'

MAX_REDIRECTIONS = 10

# Strings in a webpage that are indicative of a registration link.
LINK_CLUES = ['regist', 'user', 'sign', 'login', 'account']

MAX_SAME_DOMAIN_URLS_NO = 30
MAX_TOTAL_URLS_PER_DOMAIN = 300
MAX_OPEN_FILES_NO = 500

# URLs are selected for downloading with the following rules from the link
# lists, giving more weight to the links that contain a link clue.
CLUE_SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
CLUE_GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10
GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10

MAX_ALLOWED_THREADS = MAX_OPEN_FILES_NO / MAX_SAME_DOMAIN_URLS_NO + 1
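# With the default values above (and Python 2 integer division), the
# per-iteration quotas work out to 9 + 9 + 6 + 6 = 30 URLs, and
# MAX_ALLOWED_THREADS is 500 / 30 + 1 = 17.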


class Retriever(object):
  """Download, parse, and check if the web page contains a registration form.

  Objects of this class have a one-to-one relation with the web pages: for
  each page that is downloaded and parsed, an object of this class is created.
  Each Retriever object creates a curl object, which is added to the curl
  multi object of the crawler object so that the corresponding page gets
  downloaded.
  """
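  # A minimal usage sketch for a single page (the URL, domain and cookie file
  # below are hypothetical):
  #   r = Retriever('http://example.com/signup', 'example.com', 'cookies.txt')
  #   found = r.Run()  # True if the downloaded page holds a registration form.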
  logger = logging.getLogger(__name__)

  def __init__(self, url, domain, cookie_file):
    """Initializes a Retriever object.

    Args:
      url: url to download page from.
      domain: only links with this domain will be retrieved.
      cookie_file: the name of a cookie file, needed for pages that use session
        cookies to change their contents.
    """
    self._url = url
    self._domain = domain
    self._html_content = ''

    # Http links without clues from LINK_CLUES.
    self._general_links = []
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Https links that do not contain any clues from LINK_CLUES.
    self._secure_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    self._cookie_file = cookie_file
    self._curl_object = None

  def __del__(self):
    """Cleans up before this object is destroyed.

    The function closes the corresponding curl object that does the
    downloading.
    """
    if self._curl_object:
      self._curl_object.close()

  def _AddLink(self, link):
    """Adds url |link|, if not already present, to the appropriate list.

    The link only gets added to the single list that is appropriate for it:
    _secure_links, _general_links, _clues_secure_links or _clues_general_links.

    Args:
      link: the url that is inserted into the appropriate links list.
    """
    # Handles sites with unicode URLs.
    if isinstance(link, unicode):
      # Encode in 'utf-8' to avoid the UnicodeEncodeError exception.
      link = httplib2.iri2uri(link).encode('utf-8')
    link_parsed = urlparse.urlparse(link)
    link_lists = [self._clues_secure_links, self._secure_links,
                  self._clues_general_links, self._general_links]
    # Checks that the registration page is within the domain.
    if (self._domain in link_parsed[1] and
        all(link not in x for x in link_lists)):
      for clue in LINK_CLUES:
        if clue in link.lower():
          if link_parsed[0].startswith('https'):
            self._clues_secure_links.append(link)
            return
          else:
            self._clues_general_links.append(link)
            return
      if link_parsed[0].startswith('https'):  # No clues found in the link.
        self._secure_links.append(link)
      else:
        self._general_links.append(link)

  def ParseAndGetLinks(self):
    """Parses the downloaded page and gets its links if it has no reg form.

    Checks if the current page contains a registration form. If not, it
    collects the page's url links. If it is a registration page, it saves it
    in a file as 'grabber-' + domain + '.html' after adding the
    FORM_LOCATION_COMMENT, and returns True. Otherwise it returns False.

    Returns:
      True if the current page contains a registration form, and False
      otherwise.

    Raises:
      IOError: When the file cannot be written.
    """
    if not self._domain:
      self.logger.error('Error: self._domain was not set')
      sys.exit(1)
    match_list = re.findall(r'(?P<quote>[\'\"])(?P<link>(?:https?:)?//.*?)\1',
                            self._html_content)
    for group_list in match_list:
      link = group_list[1]
      if link.startswith('//'):
        link = urlparse.urljoin(self._url, link)
      self._AddLink(link)
    try:
      tree = html.fromstring(self._html_content, parser=html.HTMLParser())
    except etree.LxmlError:
      self.logger.info('\t\tSkipping: not valid HTML code in this page <<< %s',
                       self._url)
      return False
    try:
      body = tree.iter('body').next()
    except StopIteration:
      self.logger.info('\t\tSkipping: no "BODY" tag in this page <<< %s',
                       self._url)
      return False

    # Get a list of all input elements with attribute type='password'.
    password_elements = list(body.iterfind('.//input[@type="password"]'))
    # Check for multiple password elements to distinguish between a login form
    # and a registration form (Password field and Confirm Password field).
    if password_elements and len(password_elements) >= 2:
      form_elements = []
      for password_elem in password_elements:
        form_elem = password_elem.xpath('ancestor::form[1]')
        if not form_elem:
          continue
        if not form_elem[0] in form_elements:
          form_elements.append(form_elem[0])
        else:
          # The page contains a registration form if two password fields are
          # contained in the same form (form_elem[0]).
          if not os.path.isdir(REGISTER_PAGE_DIR):
            os.makedirs(REGISTER_PAGE_DIR)
          # Locate the HTML tag and insert the form location comment after it.
          html_tag = tree.iter('html').next()
          comment = etree.Comment(FORM_LOCATION_COMMENT % self._url)
          html_tag.insert(0, comment)
          # Create a new file and save the HTML registration page code.
          f = open('%s/%s%s.html' % (REGISTER_PAGE_DIR, HTML_FILE_PREFIX,
                                     self._domain), 'w')
          try:
            f.write(html.tostring(tree, pretty_print=True))
          except IOError as e:
            self.logger.error('Error: %s', e)
            raise
          finally:
            f.close()
          return True  # Registration page found.
    # Indicates page is not a registration page and links must be parsed.
    link_elements = list(body.iter('a'))
    for link_elem in link_elements:
      link = link_elem.get('href')
      if not link or '#' == link[0]:
        continue
      link = urlparse.urljoin(self._url, link)
      link_parsed = urlparse.urlparse(link)
      if not link_parsed[0].startswith('http'):
        continue
      self._AddLink(link)
    return False  # Registration page not found.

  def InitRequestHead(self):
    """Initializes the curl object for a HEAD request.

    A HEAD request is initiated so that we can check from the headers whether
    this is a valid HTML file. If it is not a valid HTML file, then we do not
    initiate a GET request, which saves an unnecessary download.
    """
    self._curl_object = pycurl.Curl()
    self._curl_object.setopt(pycurl.URL, self._url)
    # The following line fixes the GnuTLS package error that pycurl depends
    # on for getting https pages.
    self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
    self._curl_object.setopt(pycurl.FOLLOWLOCATION, True)
    self._curl_object.setopt(pycurl.NOBODY, True)
    self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False)
    self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS)
    self._curl_object.setopt(pycurl.FAILONERROR, False)
    self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file)
    self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file)
    self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30)
    self._curl_object.setopt(pycurl.TIMEOUT, 300)
    self._curl_object.setopt(pycurl.NOSIGNAL, 1)

  def InitRequestGet(self):
    """Initializes the curl object for a GET request.

    This is called only for valid HTML files. Pycurl then makes a GET request,
    and since not all of the page's data arrives at once, each chunk that is
    downloaded is handed to the write callback, which appends it to the page
    content until everything has been downloaded.
    """
    self._curl_object.setopt(pycurl.NOBODY, False)
    self._curl_object.setopt(
        pycurl.WRITEFUNCTION, lambda buff: setattr(
            self, '_html_content', self._html_content + buff))

  def Download(self):
    """Downloads the self._url page.

    It first does a HEAD request and then it proceeds to a GET request.
    It uses a curl object for a single download. This function is called only
    once for the initial url of a site when we still don't have more urls from
    a domain.

    Returns:
      True, if the downloaded page is valid HTML code, or False otherwise.
    """
    self.InitRequestHead()
    try:
      self._curl_object.perform()
    except pycurl.error as e:
      self.logger.error('Error: %s, url: %s', e, self._url)
      return False
    self._url = urlparse.urljoin(
        self._url, self._curl_object.getinfo(pycurl.EFFECTIVE_URL))
    content_type = self._curl_object.getinfo(pycurl.CONTENT_TYPE)
    if content_type and ('text/html' in content_type.lower()):
      self.InitRequestGet()
      try:
        self._curl_object.perform()
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
        return False
      return True
    else:
      self.logger.info('\tSkipping: Not an HTML page <<< %s', self._url)
      return False

  def Run(self):
    """Called only once for the initial url when we do not have more urls.

    Downloads the originally-specified site url, parses it and gets the links.

    Returns:
      True, if a registration page is found, and False otherwise.
    """
    if self.Download():
      if not self._domain:
        url_parsed = urlparse.urlparse(self._url)
        self._domain = url_parsed[1]
        if self._domain.startswith('www'):
          self._domain = '.'.join(self._domain.split('.')[1:])
      if self.ParseAndGetLinks():
        return True
    return False


class Crawler(object):
  """Crawls a site until a registration page is found or max level is reached.

  Creates, uses and destroys Retriever objects. Creates a cookie temp file
  needed for session cookies. It keeps track of 'visited links' and
  'links to visit' of the site. To do this it uses the links discovered from
  each Retriever object. Use Run() to crawl the site.
  """
  try:
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
  except ImportError:
    pass
  logger = logging.getLogger(__name__)

  def __init__(self, url, logging_level=None):
    """Inits the crawler URL, links lists and logger, and creates cookie file.

    The cookie temp file is needed for session cookies.

    Args:
      url: the initial "seed" url of the site.
      logging_level: the desired verbosity level, default is None.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self.url_error = False
    url_parsed = urlparse.urlparse(url)
    if not url_parsed[0].startswith('http'):
      self.logger.error(
          'Error: "%s" does not begin with http:// or https://', url)
      self.url_error = True
      return
    # Example: if url is 'http://www.example.com?name=john' then value [1], the
    # network location, is 'www.example.com'.
    if not url_parsed[1]:
      self.logger.error('Error: "%s" is not a valid url', url)
      self.url_error = True
      return
    self._url = url
    self._domain = ''
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Http links that do not contain any clue from LINK_CLUES.
    self._general_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    # Https links that do not contain any clue from LINK_CLUES.
    self._secure_links = []
    # All links downloaded and parsed so far.
    self._links_visited = []
    self._retrievers_list = []
    self._cookie_file = tempfile.NamedTemporaryFile(
        suffix='.cookie', delete=False)
    self._cookie_file.close()
    self._cookie_file = self._cookie_file.name  # Keep only the filename.

  def __del__(self):
    """Deletes the cookie file when Crawler instances are destroyed."""
    if hasattr(self, '_cookie_file'):
      self.logger.info('Deleting cookie file %s ...', self._cookie_file)
      os.unlink(self._cookie_file)

  def _MultiPerform(self, curl_multi_object):
    """Performs concurrent downloads using a CurlMulti object.

    Args:
      curl_multi_object: a curl object that downloads multiple pages
        concurrently. The class of this object is |pycurl.CurlMulti|.
    """
    # The following code uses the example for the CurlMulti object at
    # http://pycurl.sourceforge.net/doc/curlmultiobject.html.
    while True:
      ret, no_handles = curl_multi_object.perform()
      if ret != pycurl.E_CALL_MULTI_PERFORM:
        break
    while no_handles:
      curl_multi_object.select(1.0)
      while True:
        ret, no_handles = curl_multi_object.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
          break

  def _GetLinksPages(self, curl_multi_object):
    """Downloads many pages concurrently using a CurlMulti object.

    Creates many Retriever objects and adds them to a list. The constant
    MAX_SAME_DOMAIN_URLS_NO defines the number of pages that can be downloaded
    concurrently from the same domain using the pycurl multi object. It's
    currently set to 30 URLs. These URLs are taken from the links lists, that
    is from csl, cgl, sl, and gl. The rules define how many URLs are taken
    from each list during each iteration.

    Example of the rules:
      3/10 from csl results in 9 URLs
      3/10 from cgl results in 9 URLs
      2/10 from sl results in 6 URLs
      2/10 from gl results in 6 URLs

    Adding up the above URLs gives the 30 URLs that can be downloaded
    concurrently. If these lists have fewer items than the defined rules, such
    as when a site does not contain any secure links, then the csl and sl
    lists will have length 0 and only 15 pages would be downloaded
    concurrently from the same domain.

    Since 30 URLs can be handled concurrently, the number of links taken from
    the other lists can be increased. This means that we can take 24 links
    from the cgl list so that 24 from cgl + 6 from gl = 30 URLs. If the cgl
    list has fewer than 24 links, e.g. only 21 links, then 9 links may be
    taken from gl so that 0 + 21 + 0 + 9 = 30.

    Args:
      curl_multi_object: Each Retriever object has a curl object which is
        added to the CurlMulti object.
    """
    self._retrievers_list = []

    csl_no = min(CLUE_SECURE_LINKS_NO, len(self._clues_secure_links))
    cgl_no = min(CLUE_GENERAL_LINKS_NO, len(self._clues_general_links))
    sl_no = min(SECURE_LINKS_NO, len(self._secure_links))
    gl_no = min(GENERAL_LINKS_NO, len(self._general_links))

    # If some links within the list have fewer items than needed, the missing
    # links will be taken by the following priority: csl, cgl, sl, gl.
    # c: clues, s: secure, g: general, l: list.
    spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      csl_no = min(csl_no + spare_links, len(self._clues_secure_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      cgl_no = min(cgl_no + spare_links, len(self._clues_general_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      sl_no = min(sl_no + spare_links, len(self._secure_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      gl_no = min(gl_no + spare_links, len(self._general_links))

    for no_of_links, links in [
        (csl_no, self._clues_secure_links),
        (sl_no, self._secure_links),
        (cgl_no, self._clues_general_links),
        (gl_no, self._general_links)]:
      for i in xrange(no_of_links):
        if not links:
          break
        url = links.pop(0)
        self._links_visited.append(url)
        r = Retriever(url, self._domain, self._cookie_file)
        r.InitRequestHead()
        curl_multi_object.add_handle(r._curl_object)
        self._retrievers_list.append(r)

    if self._retrievers_list:
      try:
        self._MultiPerform(curl_multi_object)
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
      finally:
        for r in self._retrievers_list:
          curl_multi_object.remove_handle(r._curl_object)
      # |_retrievers_list[:]| is a copy of |_retrievers_list| to avoid removing
      # items from the iterated list.
      for r in self._retrievers_list[:]:
        r._url = urlparse.urljoin(r._url, r._curl_object.getinfo(
            pycurl.EFFECTIVE_URL))
        content_type = r._curl_object.getinfo(pycurl.CONTENT_TYPE)
        if content_type and ('text/html' in content_type.lower()):
          r.InitRequestGet()
          curl_multi_object.add_handle(r._curl_object)
        else:
          self._retrievers_list.remove(r)
          self.logger.info('\tSkipping: Not an HTML page <<< %s', r._url)
      if self._retrievers_list:
        try:
          self._MultiPerform(curl_multi_object)
        except pycurl.error as e:
          self.logger.error('Error: %s, url: %s', e, self._url)
        finally:
          for r in self._retrievers_list:
            curl_multi_object.remove_handle(r._curl_object)
            self.logger.info('Downloaded: %s', r._url)

  def _LogRegPageFound(self, retriever):
    """Logs that a registration page was found.

    Args:
      retriever: The object that has retrieved the page.
    """
    self.logger.info('\t##############################################')
    self.logger.info('\t### %s ###', retriever._domain)
    self.logger.info('\t##############################################')
    self.logger.info('\t!!!!!!!!! registration page FOUND !!!!!!!!!!!')
    self.logger.info('\t%s', retriever._url)
    self.logger.info('\t##############################################')

  def _GetNewLinks(self, retriever):
    """Appends new links discovered by each retriever to the appropriate lists.

    Links are copied to the links lists of the crawler object, which hold all
    the links found from all retrievers that the crawler object created. The
    Crawler object exists for as long as a specific site is examined, while a
    Retriever object exists only for as long as a single page of this site is
    examined.

    Args:
      retriever: a temporary object that downloads a specific page, parses the
        content and gets the page's href links.
    """
    for link in retriever._clues_secure_links:
      if (not link in self._clues_secure_links and
          not link in self._links_visited):
        self._clues_secure_links.append(link)
    for link in retriever._secure_links:
      if (not link in self._secure_links and
          not link in self._links_visited):
        self._secure_links.append(link)
    for link in retriever._clues_general_links:
      if (not link in self._clues_general_links and
          not link in self._links_visited):
        self._clues_general_links.append(link)
    for link in retriever._general_links:
      if (not link in self._general_links and
          not link in self._links_visited):
        self._general_links.append(link)

  def Run(self):
    """Runs the Crawler.

    Creates a Retriever object and calls its run method to get the first links,
    and then uses a CurlMulti object and creates many Retriever objects to get
    the subsequent pages.

    The number of pages (i.e. Retriever objects) created each time is
    restricted by MAX_SAME_DOMAIN_URLS_NO. After this number of Retriever
    objects download and parse their pages, we do the same again. The total
    number of pages visited is kept in urls_visited.
    If no registration page is found, the Crawler object gives up after
    MAX_TOTAL_URLS_PER_DOMAIN pages have been visited.

    Returns:
      True, if a registration page is found, and False otherwise.
    """
    reg_page_found = False
    if self.url_error:
      return False
    r = Retriever(self._url, self._domain, self._cookie_file)
    if r.Run():
      self._LogRegPageFound(r)
      reg_page_found = True
    else:
      self._url = r._url
      self._domain = r._domain
      self.logger.info('url to crawl: %s', self._url)
      self.logger.info('domain: %s', self._domain)
      self._links_visited.append(r._url)
      self._GetNewLinks(r)
      urls_visited = 1
      while True:
        if (not (self._clues_secure_links or self._secure_links or
                 self._clues_general_links or self._general_links) or
            urls_visited >= MAX_TOTAL_URLS_PER_DOMAIN):
          break  # Registration page not found.
        m = pycurl.CurlMulti()
        self._GetLinksPages(m)
        urls_visited += len(self._retrievers_list)
        self.logger.info('\t<----- URLs visited for domain "%s": %d ----->',
                         self._domain, urls_visited)
        for r in self._retrievers_list:
          if r.ParseAndGetLinks():
            self._LogRegPageFound(r)
            reg_page_found = True
            break
          else:
            self.logger.info('parsed: %s', r._url)
            self._GetNewLinks(r)
        m.close()
        if reg_page_found:
          break
    while self._retrievers_list:
      r = self._retrievers_list.pop()
    return reg_page_found


class WorkerThread(threading.Thread):
  """Creates a new thread of execution."""
  def __init__(self, url):
    """Creates the _url and page_found attributes.

    They are used after the thread's termination to create a file listing the
    urls for which a registration page wasn't found.

    Args:
      url: will be used as an argument to create a Crawler object later.
    """
    threading.Thread.__init__(self)
    self._url = url
    self.page_found = False

  def run(self):
    """Execution of the thread creates a Crawler object and runs it.

    Caution: this function name should not be changed to 'Run' or any other
    name, because it overrides the 'run' method of the 'threading.Thread'
    class. Otherwise it will never be called.
    """
    self.page_found = Crawler(self._url).Run()


class ThreadedCrawler(object):
  """Runs WorkerThread objects, each of which creates and runs a Crawler.

  The Crawler objects run concurrently, each examining a single site.
  """
  logger = logging.getLogger(__name__)

  def __init__(self, urls_file, logging_level=None):
    """Creates threaded Crawler objects.

    Args:
      urls_file: a text file containing a URL on each line.
      logging_level: verbosity level, default is None.

    Raises:
      IOError: If no URLs can be found in the file.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self._urls_list = []
    f = open(urls_file)
    try:
      for url in f.readlines():
        url = url.strip()
        if not urlparse.urlparse(url)[0].startswith('http'):
          self.logger.info(
              '%s: skipping this (does not begin with "http://")', url)
          continue
        self._urls_list.append(url)
    except IOError as e:
      self.logger.error('Error: %s', e)
      raise
    finally:
      f.close()
    if not self._urls_list:
      error_msg = 'No URLs were found.'
      self.logger.error('ERROR: %s', error_msg)
      raise IOError(error_msg)

  def Run(self):
    """Runs Crawler objects using python threads.

    The number of concurrent threads is restricted to MAX_ALLOWED_THREADS.

    Returns:
      The number of registration pages found. -1 if no URLs are given.

    Raises:
      OSError: If the directory for the output file cannot be created.
    """
    if self._urls_list:
      allThreads = []
      # originalNumThreads is the number of threads just before the
      # ThreadedCrawler starts creating new threads. As a standalone script it
      # will be 1.
      originalNumThreads = threading.active_count()
      for url in self._urls_list:
        self.logger.info('URL fed to a crawler thread: %s', url)
        t = WorkerThread(url)
        t.start()
        allThreads.append(t)
        while threading.active_count() >= (
            MAX_ALLOWED_THREADS + originalNumThreads):
          time.sleep(.4)
      while threading.active_count() > originalNumThreads:
        time.sleep(.4)
      self.logger.info('----------------')
      self.logger.info('--- FINISHED ---')
      self.logger.info('----------------')
      urls_no = 0
      urls_not_found_no = 0
      not_file_name = os.path.join(
          REGISTER_PAGE_DIR, NOT_FOUND_REG_PAGE_SITES_FILENAME)
      not_file_dir = os.path.dirname(not_file_name)
      try:
        os.makedirs(not_file_dir)
      except OSError as e:
        if e.errno != errno.EEXIST:
          raise
      fnot = open(not_file_name, 'wb')
      try:
        for t in sorted(allThreads, key=lambda t: t._url):
          urls_no += 1
          if not t.page_found:
            urls_not_found_no += 1
            fnot.write('%s' % t._url)
            fnot.write(os.linesep)
      except IOError as e:
        self.logger.error('Error: %s', e)
      finally:
        fnot.close()
      self.logger.info('Total number of URLs given: %d\n', urls_no)
      self.logger.info(
          'Registration pages found: %d\n', (urls_no - urls_not_found_no))
      self.logger.info(
          'URLs that did not return a registration page: %d\n',
          urls_not_found_no)
      return urls_no - urls_not_found_no
    else:
      self.logger.error('Error: no URLs were found.')
      return -1


def main():
  usage = 'usage: %prog [options] single_url_or_urls_filename'
  parser = optparse.OptionParser(usage)
  parser.add_option(
      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
      help='LOG_LEVEL: debug, info, warning or error [default: %default]')

  (options, args) = parser.parse_args()
  options.log_level = options.log_level.upper()
  if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
    print 'Wrong log_level argument.'
    parser.print_help()
    return 1
  options.log_level = getattr(logging, options.log_level)

  if len(args) != 1:
    parser.error('Wrong number of arguments.')

  logger = logging.getLogger(__name__)
  if options.log_level:
    console = logging.StreamHandler()
    logger.addHandler(console)
    logger.setLevel(options.log_level)

  arg_is_a_file = os.path.isfile(args[0])
  if arg_is_a_file:
    CrawlerClass = ThreadedCrawler
  else:
    CrawlerClass = Crawler
  t0 = datetime.datetime.now()
  c = CrawlerClass(args[0], options.log_level)
  c.Run()
  if not arg_is_a_file and c.url_error:
    logger.error(
        'ERROR: "%s" is neither a valid filename nor a valid URL', args[0])
  t1 = datetime.datetime.now()
  delta_t = t1 - t0
  logger.info('Started at: %s\n', t0)
  logger.info('Ended at: %s\n', t1)
  logger.info('Total execution time: %s\n', delta_t)
  return 0


if __name__ == "__main__":
  sys.exit(main())