#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

| 6 | """Downloads web pages with fillable forms after parsing through a set of links. |
| 7 | |
| 8 | Used for collecting web pages with forms. Used as a standalone script. |
| 9 | This script assumes that it's run from within the same directory in which it's |
| 10 | checked into. If this script were to be run elsewhere then the path for |
| 11 | REGISTER_PAGE_DIR needs to be changed. |
| 12 | |
| 13 | This script assumes that third party modules are installed: |
| 14 | httplib2, lxml, pycurl. |
| 15 | |
| 16 | Usage: webforms_aggregator.py [options] [single url or file containing urls] |
| 17 | |
| 18 | Options: |
| 19 | -l LOG_LEVEL, --log_level LOG_LEVEL |
| 20 | LOG_LEVEL: debug, info, warning or error [default: error] |
| 21 | -h, --help show this help message and exit |
| 22 | """ |

import datetime
import errno
import logging
import optparse
import os
import re
# Needed on Linux so that PyCurl does not throw a segmentation fault.
import signal
import sys
import tempfile
import threading
import time
import urlparse

import httplib2
from lxml import html, etree
import pycurl

REGISTER_PAGE_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                 'heuristics', 'input')
NOT_FOUND_REG_PAGE_SITES_FILENAME = 'notFoundRegPageSites.txt'

FORM_LOCATION_COMMENT = 'Form Location: %s'
HTML_FILE_PREFIX = 'grabber-'

MAX_REDIRECTIONS = 10

# Strings in a webpage that are indicative of a registration link.
LINK_CLUES = ['regist', 'user', 'sign', 'login', 'account']

MAX_SAME_DOMAIN_URLS_NO = 30
MAX_TOTAL_URLS_PER_DOMAIN = 300
MAX_OPEN_FILES_NO = 500

# URLs are selected for downloading with the following rules from the link
# lists, giving more weight to the links that contain a link clue.
CLUE_SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
CLUE_GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10
GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10

MAX_ALLOWED_THREADS = MAX_OPEN_FILES_NO / MAX_SAME_DOMAIN_URLS_NO + 1
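# With the default values above (and Python 2 integer division), the
# per-iteration quotas work out to 9 + 9 + 6 + 6 = 30 URLs, and
# MAX_ALLOWED_THREADS is 500 / 30 + 1 = 17.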


class Retriever(object):
  """Download, parse, and check if the web page contains a registration form.

  Objects of this class have a one-to-one relation with the web pages: for
  each page that is downloaded and parsed, an object of this class is created.
  Each Retriever object creates a curl object, which is added to the curl
  multi object of the crawler object so that the corresponding page gets
  downloaded.
  """
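  # A minimal usage sketch for a single page (the URL, domain and cookie file
  # below are hypothetical):
  #   r = Retriever('http://example.com/signup', 'example.com', 'cookies.txt')
  #   found = r.Run()  # True if the downloaded page holds a registration form.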
  logger = logging.getLogger(__name__)

  def __init__(self, url, domain, cookie_file):
    """Initializes a Retriever object.

    Args:
      url: url to download page from.
      domain: only links with this domain will be retrieved.
      cookie_file: the name of a cookie file, needed for pages that use session
        cookies to change their contents.
    """
    self._url = url
    self._domain = domain
    self._html_content = ''

    # Http links without clues from LINK_CLUES.
    self._general_links = []
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Https links that do not contain any clues from LINK_CLUES.
    self._secure_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    self._cookie_file = cookie_file
    self._curl_object = None

  def __del__(self):
    """Cleans up before this object is destroyed.

    The function closes the corresponding curl object that does the
    downloading.
    """
    if self._curl_object:
      self._curl_object.close()

  def _AddLink(self, link):
    """Adds url |link|, if not already present, to the appropriate list.

    The link only gets added to the single list that is appropriate for it:
    _secure_links, _general_links, _clues_secure_links or _clues_general_links.

    Args:
      link: the url that is inserted into the appropriate links list.
    """
    # Handles sites with unicode URLs.
    if isinstance(link, unicode):
      # Encode in 'utf-8' to avoid the UnicodeEncodeError exception.
      link = httplib2.iri2uri(link).encode('utf-8')
    link_parsed = urlparse.urlparse(link)
    link_lists = [self._clues_secure_links, self._secure_links,
                  self._clues_general_links, self._general_links]
    # Checks that the registration page is within the domain.
    if (self._domain in link_parsed[1] and
        all(link not in x for x in link_lists)):
      for clue in LINK_CLUES:
        if clue in link.lower():
          if link_parsed[0].startswith('https'):
            self._clues_secure_links.append(link)
            return
          else:
            self._clues_general_links.append(link)
            return
      if link_parsed[0].startswith('https'):  # No clues found in the link.
        self._secure_links.append(link)
      else:
        self._general_links.append(link)

  def ParseAndGetLinks(self):
    """Parses the downloaded page and gets its links if it has no reg form.

    Checks if the current page contains a registration form. If not, it
    collects the page's url links. If it is a registration page, it saves it
    in a file as 'grabber-' + domain + '.html' after adding the
    FORM_LOCATION_COMMENT, and returns True. Otherwise it returns False.

    Returns:
      True if the current page contains a registration form, and False
      otherwise.

    Raises:
      IOError: When the file cannot be written.
    """
    if not self._domain:
      self.logger.error('Error: self._domain was not set')
      sys.exit(1)
    match_list = re.findall(r'(?P<quote>[\'\"])(?P<link>(?:https?:)?//.*?)\1',
                            self._html_content)
    for group_list in match_list:
      link = group_list[1]
      if link.startswith('//'):
        link = urlparse.urljoin(self._url, link)
      self._AddLink(link)
    try:
      tree = html.fromstring(self._html_content, parser=html.HTMLParser())
    except etree.LxmlError:
      self.logger.info('\t\tSkipping: not valid HTML code in this page <<< %s',
                       self._url)
      return False
    try:
      body = tree.iter('body').next()
    except StopIteration:
      self.logger.info('\t\tSkipping: no "BODY" tag in this page <<< %s',
                       self._url)
      return False

    # Get a list of all input elements with attribute type='password'.
    password_elements = list(body.iterfind('.//input[@type="password"]'))
    # Check for multiple password elements to distinguish between a login form
    # and a registration form (Password field and Confirm Password field).
    if password_elements and len(password_elements) >= 2:
      form_elements = []
      for password_elem in password_elements:
        form_elem = password_elem.xpath('ancestor::form[1]')
        if not form_elem:
          continue
        if not form_elem[0] in form_elements:
          form_elements.append(form_elem[0])
        else:
          # The page contains a registration form if two password fields are
          # contained in the same form (form_elem[0]).
          if not os.path.isdir(REGISTER_PAGE_DIR):
            os.makedirs(REGISTER_PAGE_DIR)
          # Locate the HTML tag and insert the form location comment after it.
          html_tag = tree.iter('html').next()
          comment = etree.Comment(FORM_LOCATION_COMMENT % self._url)
          html_tag.insert(0, comment)
          # Create a new file and save the HTML registration page code.
          f = open('%s/%s%s.html' % (REGISTER_PAGE_DIR, HTML_FILE_PREFIX,
                                     self._domain), 'w')
          try:
            f.write(html.tostring(tree, pretty_print=True))
          except IOError as e:
            self.logger.error('Error: %s', e)
            raise
          finally:
            f.close()
          return True  # Registration page found.
    # Indicates page is not a registration page and links must be parsed.
    link_elements = list(body.iter('a'))
    for link_elem in link_elements:
      link = link_elem.get('href')
      if not link or '#' == link[0]:
        continue
      link = urlparse.urljoin(self._url, link)
      link_parsed = urlparse.urlparse(link)
      if not link_parsed[0].startswith('http'):
        continue
      self._AddLink(link)
    return False  # Registration page not found.

  def InitRequestHead(self):
    """Initializes the curl object for a HEAD request.

    A HEAD request is initiated so that we can check from the headers whether
    this is a valid HTML file. If it is not a valid HTML file, then we do not
    initiate a GET request, which saves an unnecessary download.
    """
    self._curl_object = pycurl.Curl()
    self._curl_object.setopt(pycurl.URL, self._url)
    # The following line fixes the GnuTLS package error that pycurl depends
    # on for getting https pages.
    self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
    self._curl_object.setopt(pycurl.FOLLOWLOCATION, True)
    self._curl_object.setopt(pycurl.NOBODY, True)
    self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False)
    self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS)
    self._curl_object.setopt(pycurl.FAILONERROR, False)
    self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file)
    self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file)
    self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30)
    self._curl_object.setopt(pycurl.TIMEOUT, 300)
    self._curl_object.setopt(pycurl.NOSIGNAL, 1)

  def InitRequestGet(self):
    """Initializes the curl object for a GET request.

    This is called only for valid HTML files. Pycurl then makes a GET request,
    and since not all of the page's data arrives at once, each chunk that is
    downloaded is handed to the write callback, which appends it to the page
    content until everything has been downloaded.
    """
    self._curl_object.setopt(pycurl.NOBODY, False)
    self._curl_object.setopt(
        pycurl.WRITEFUNCTION, lambda buff: setattr(
            self, '_html_content', self._html_content + buff))

  def Download(self):
    """Downloads the self._url page.

    It first does a HEAD request and then it proceeds to a GET request.
    It uses a curl object for a single download. This function is called only
    once for the initial url of a site when we still don't have more urls from
    a domain.

    Returns:
      True, if the downloaded page is valid HTML code, or False otherwise.
    """
    self.InitRequestHead()
    try:
      self._curl_object.perform()
    except pycurl.error as e:
      self.logger.error('Error: %s, url: %s', e, self._url)
      return False
    self._url = urlparse.urljoin(
        self._url, self._curl_object.getinfo(pycurl.EFFECTIVE_URL))
    content_type = self._curl_object.getinfo(pycurl.CONTENT_TYPE)
    if content_type and ('text/html' in content_type.lower()):
      self.InitRequestGet()
      try:
        self._curl_object.perform()
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
        return False
      return True
    else:
      self.logger.info('\tSkipping: Not an HTML page <<< %s', self._url)
      return False

  def Run(self):
    """Called only once for the initial url when we do not have more urls.

    Downloads the originally-specified site url, parses it and gets the links.

    Returns:
      True, if a registration page is found, and False otherwise.
    """
    if self.Download():
      if not self._domain:
        url_parsed = urlparse.urlparse(self._url)
        self._domain = url_parsed[1]
        if self._domain.startswith('www'):
          self._domain = '.'.join(self._domain.split('.')[1:])
      if self.ParseAndGetLinks():
        return True
    return False


class Crawler(object):
  """Crawls a site until a registration page is found or max level is reached.

  Creates, uses and destroys Retriever objects. Creates a cookie temp file
  needed for session cookies. It keeps track of 'visited links' and
  'links to visit' of the site. To do this it uses the links discovered from
  each Retriever object. Use Run() to crawl the site.
  """
  try:
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
  except ImportError:
    pass
  logger = logging.getLogger(__name__)

  def __init__(self, url, logging_level=None):
    """Inits the crawler URL, links lists and logger, and creates cookie file.

    The cookie temp file is needed for session cookies.

    Args:
      url: the initial "seed" url of the site.
      logging_level: the desired verbosity level, default is None.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self.url_error = False
    url_parsed = urlparse.urlparse(url)
    if not url_parsed[0].startswith('http'):
      self.logger.error(
          'Error: "%s" does not begin with http:// or https://', url)
      self.url_error = True
      return
    # Example: if url is 'http://www.example.com?name=john' then value [1], the
    # network location, is 'www.example.com'.
    if not url_parsed[1]:
      self.logger.error('Error: "%s" is not a valid url', url)
      self.url_error = True
      return
    self._url = url
    self._domain = ''
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Http links that do not contain any clue from LINK_CLUES.
    self._general_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    # Https links that do not contain any clue from LINK_CLUES.
    self._secure_links = []
    # All links downloaded and parsed so far.
    self._links_visited = []
    self._retrievers_list = []
    self._cookie_file = tempfile.NamedTemporaryFile(
        suffix='.cookie', delete=False)
    self._cookie_file.close()
    self._cookie_file = self._cookie_file.name  # Keep only the filename.

  def __del__(self):
    """Deletes the cookie file when Crawler instances are destroyed."""
    if hasattr(self, '_cookie_file'):
      self.logger.info('Deleting cookie file %s ...', self._cookie_file)
      os.unlink(self._cookie_file)

  def _MultiPerform(self, curl_multi_object):
    """Performs concurrent downloads using a CurlMulti object.

    Args:
      curl_multi_object: a curl object that downloads multiple pages
        concurrently. The class of this object is |pycurl.CurlMulti|.
    """
    # The following code uses the example for the CurlMulti object at
    # http://pycurl.sourceforge.net/doc/curlmultiobject.html.
    while True:
      ret, no_handles = curl_multi_object.perform()
      if ret != pycurl.E_CALL_MULTI_PERFORM:
        break
    while no_handles:
      curl_multi_object.select(1.0)
      while True:
        ret, no_handles = curl_multi_object.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
          break

  def _GetLinksPages(self, curl_multi_object):
    """Downloads many pages concurrently using a CurlMulti object.

    Creates many Retriever objects and adds them to a list. The constant
    MAX_SAME_DOMAIN_URLS_NO defines the number of pages that can be downloaded
    concurrently from the same domain using the pycurl multi object. It's
    currently set to 30 URLs. These URLs are taken from the links lists, that
    is from csl, cgl, sl, and gl. The rules define how many URLs are taken
    from each list during each iteration.

    Example of the rules:
      3/10 from csl results in 9 URLs
      3/10 from cgl results in 9 URLs
      2/10 from sl results in 6 URLs
      2/10 from gl results in 6 URLs

    Adding up the above URLs gives the 30 URLs that can be downloaded
    concurrently. If these lists have fewer items than the defined rules, such
    as when a site does not contain any secure links, then the csl and sl
    lists will have length 0 and only 15 pages would be downloaded
    concurrently from the same domain.

    Since 30 URLs can be handled concurrently, the number of links taken from
    the other lists can be increased. This means that we can take 24 links
    from the cgl list so that 24 from cgl + 6 from gl = 30 URLs. If the cgl
    list has fewer than 24 links, e.g. only 21 links, then 9 links may be
    taken from gl so that 0 + 21 + 0 + 9 = 30.

    Args:
      curl_multi_object: Each Retriever object has a curl object which is
        added to the CurlMulti object.
    """
    self._retrievers_list = []

    csl_no = min(CLUE_SECURE_LINKS_NO, len(self._clues_secure_links))
    cgl_no = min(CLUE_GENERAL_LINKS_NO, len(self._clues_general_links))
    sl_no = min(SECURE_LINKS_NO, len(self._secure_links))
    gl_no = min(GENERAL_LINKS_NO, len(self._general_links))

    # If some links within the list have fewer items than needed, the missing
    # links will be taken by the following priority: csl, cgl, sl, gl.
    # c: clues, s: secure, g: general, l: list.
    spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      csl_no = min(csl_no + spare_links, len(self._clues_secure_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      cgl_no = min(cgl_no + spare_links, len(self._clues_general_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      sl_no = min(sl_no + spare_links, len(self._secure_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      gl_no = min(gl_no + spare_links, len(self._general_links))

    for no_of_links, links in [
        (csl_no, self._clues_secure_links),
        (sl_no, self._secure_links),
        (cgl_no, self._clues_general_links),
        (gl_no, self._general_links)]:
      for i in xrange(no_of_links):
        if not links:
          break
        url = links.pop(0)
        self._links_visited.append(url)
        r = Retriever(url, self._domain, self._cookie_file)
        r.InitRequestHead()
        curl_multi_object.add_handle(r._curl_object)
        self._retrievers_list.append(r)

    if self._retrievers_list:
      try:
        self._MultiPerform(curl_multi_object)
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
      finally:
        for r in self._retrievers_list:
          curl_multi_object.remove_handle(r._curl_object)
      # |_retrievers_list[:]| is a copy of |_retrievers_list| to avoid removing
      # items from the iterated list.
      for r in self._retrievers_list[:]:
        r._url = urlparse.urljoin(r._url, r._curl_object.getinfo(
            pycurl.EFFECTIVE_URL))
        content_type = r._curl_object.getinfo(pycurl.CONTENT_TYPE)
        if content_type and ('text/html' in content_type.lower()):
          r.InitRequestGet()
          curl_multi_object.add_handle(r._curl_object)
        else:
          self._retrievers_list.remove(r)
          self.logger.info('\tSkipping: Not an HTML page <<< %s', r._url)
      if self._retrievers_list:
        try:
          self._MultiPerform(curl_multi_object)
        except pycurl.error as e:
          self.logger.error('Error: %s, url: %s', e, self._url)
        finally:
          for r in self._retrievers_list:
            curl_multi_object.remove_handle(r._curl_object)
            self.logger.info('Downloaded: %s', r._url)

  def _LogRegPageFound(self, retriever):
    """Logs that a registration page was found.

    Args:
      retriever: The object that has retrieved the page.
    """
    self.logger.info('\t##############################################')
    self.logger.info('\t### %s ###', retriever._domain)
    self.logger.info('\t##############################################')
    self.logger.info('\t!!!!!!!!! registration page FOUND !!!!!!!!!!!')
    self.logger.info('\t%s', retriever._url)
    self.logger.info('\t##############################################')

  def _GetNewLinks(self, retriever):
    """Appends new links discovered by each retriever to the appropriate lists.

    Links are copied to the links lists of the crawler object, which hold all
    the links found from all retrievers that the crawler object created. The
    Crawler object exists for as long as a specific site is examined, while a
    Retriever object exists only for as long as a single page of this site is
    examined.

    Args:
      retriever: a temporary object that downloads a specific page, parses the
        content and gets the page's href links.
    """
    for link in retriever._clues_secure_links:
      if (not link in self._clues_secure_links and
          not link in self._links_visited):
        self._clues_secure_links.append(link)
    for link in retriever._secure_links:
      if (not link in self._secure_links and
          not link in self._links_visited):
        self._secure_links.append(link)
    for link in retriever._clues_general_links:
      if (not link in self._clues_general_links and
          not link in self._links_visited):
        self._clues_general_links.append(link)
    for link in retriever._general_links:
      if (not link in self._general_links and
          not link in self._links_visited):
        self._general_links.append(link)

  def Run(self):
    """Runs the Crawler.

    Creates a Retriever object and calls its run method to get the first links,
    and then uses a CurlMulti object and creates many Retriever objects to get
    the subsequent pages.

    The number of pages (i.e. Retriever objects) created each time is
    restricted by MAX_SAME_DOMAIN_URLS_NO. After this number of Retriever
    objects download and parse their pages, we do the same again. The total
    number of pages visited is kept in urls_visited.
    If no registration page is found, the Crawler object gives up after
    MAX_TOTAL_URLS_PER_DOMAIN pages have been visited.

    Returns:
      True, if a registration page is found, and False otherwise.
    """
    reg_page_found = False
    if self.url_error:
      return False
    r = Retriever(self._url, self._domain, self._cookie_file)
    if r.Run():
      self._LogRegPageFound(r)
      reg_page_found = True
    else:
      self._url = r._url
      self._domain = r._domain
      self.logger.info('url to crawl: %s', self._url)
      self.logger.info('domain: %s', self._domain)
      self._links_visited.append(r._url)
      self._GetNewLinks(r)
      urls_visited = 1
      while True:
        if (not (self._clues_secure_links or self._secure_links or
                 self._clues_general_links or self._general_links) or
            urls_visited >= MAX_TOTAL_URLS_PER_DOMAIN):
          break  # Registration page not found.
        m = pycurl.CurlMulti()
        self._GetLinksPages(m)
        urls_visited += len(self._retrievers_list)
        self.logger.info('\t<----- URLs visited for domain "%s": %d ----->',
                         self._domain, urls_visited)
        for r in self._retrievers_list:
          if r.ParseAndGetLinks():
            self._LogRegPageFound(r)
            reg_page_found = True
            break
          else:
            self.logger.info('parsed: %s', r._url)
            self._GetNewLinks(r)
        m.close()
        if reg_page_found:
          break
    while self._retrievers_list:
      r = self._retrievers_list.pop()
    return reg_page_found


class WorkerThread(threading.Thread):
  """Creates a new thread of execution."""
  def __init__(self, url):
    """Creates the _url and page_found attributes.

    They are used after the thread's termination to create a file listing the
    urls for which a registration page wasn't found.

    Args:
      url: will be used as an argument to create a Crawler object later.
    """
    threading.Thread.__init__(self)
    self._url = url
    self.page_found = False

  def run(self):
    """Execution of the thread creates a Crawler object and runs it.

    Caution: this function name should not be changed to 'Run' or any other
    name, because it overrides the 'run' method of the 'threading.Thread'
    class. Otherwise it will never be called.
    """
    self.page_found = Crawler(self._url).Run()


class ThreadedCrawler(object):
  """Runs WorkerThread objects, each of which creates and runs a Crawler.

  The Crawler objects run concurrently, each examining a single site.
  """
  logger = logging.getLogger(__name__)

  def __init__(self, urls_file, logging_level=None):
    """Creates threaded Crawler objects.

    Args:
      urls_file: a text file containing a URL on each line.
      logging_level: verbosity level, default is None.

    Raises:
      IOError: If no URLs can be found in the file.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self._urls_list = []
    f = open(urls_file)
    try:
      for url in f.readlines():
        url = url.strip()
        if not urlparse.urlparse(url)[0].startswith('http'):
          self.logger.info(
              '%s: skipping this (does not begin with "http://")', url)
          continue
        self._urls_list.append(url)
    except IOError as e:
      self.logger.error('Error: %s', e)
      raise
    finally:
      f.close()
    if not self._urls_list:
      error_msg = 'No URLs were found.'
      self.logger.error('ERROR: %s', error_msg)
      raise IOError(error_msg)

  def Run(self):
    """Runs Crawler objects using python threads.

    The number of concurrent threads is restricted to MAX_ALLOWED_THREADS.

    Returns:
      The number of registration pages found. -1 if no URLs are given.

    Raises:
      OSError: If the directory for the output file cannot be created.
    """
    if self._urls_list:
      allThreads = []
      # originalNumThreads is the number of threads just before the
      # ThreadedCrawler starts creating new threads. As a standalone script it
      # will be 1.
      originalNumThreads = threading.active_count()
      for url in self._urls_list:
        self.logger.info('URL fed to a crawler thread: %s', url)
        t = WorkerThread(url)
        t.start()
        allThreads.append(t)
        while threading.active_count() >= (
            MAX_ALLOWED_THREADS + originalNumThreads):
          time.sleep(.4)
      while threading.active_count() > originalNumThreads:
        time.sleep(.4)
      self.logger.info('----------------')
      self.logger.info('--- FINISHED ---')
      self.logger.info('----------------')
      urls_no = 0
      urls_not_found_no = 0
      not_file_name = os.path.join(
          REGISTER_PAGE_DIR, NOT_FOUND_REG_PAGE_SITES_FILENAME)
      not_file_dir = os.path.dirname(not_file_name)
      try:
        os.makedirs(not_file_dir)
      except OSError as e:
        if e.errno != errno.EEXIST:
          raise
      fnot = open(not_file_name, 'wb')
      try:
        for t in sorted(allThreads, key=lambda t: t._url):
          urls_no += 1
          if not t.page_found:
            urls_not_found_no += 1
            fnot.write('%s' % t._url)
            fnot.write(os.linesep)
      except IOError as e:
        self.logger.error('Error: %s', e)
      finally:
        fnot.close()
      self.logger.info('Total number of URLs given: %d\n', urls_no)
      self.logger.info(
          'Registration pages found: %d\n', (urls_no - urls_not_found_no))
      self.logger.info(
          'URLs that did not return a registration page: %d\n',
          urls_not_found_no)
      return urls_no - urls_not_found_no
    else:
      self.logger.error('Error: no URLs were found.')
      return -1


def main():
  usage = 'usage: %prog [options] single_url_or_urls_filename'
  parser = optparse.OptionParser(usage)
  parser.add_option(
      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
      help='LOG_LEVEL: debug, info, warning or error [default: %default]')

  (options, args) = parser.parse_args()
  options.log_level = options.log_level.upper()
  if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
    print 'Wrong log_level argument.'
    parser.print_help()
    return 1
  options.log_level = getattr(logging, options.log_level)

  if len(args) != 1:
    parser.error('Wrong number of arguments.')

  logger = logging.getLogger(__name__)
  if options.log_level:
    console = logging.StreamHandler()
    logger.addHandler(console)
    logger.setLevel(options.log_level)

  arg_is_a_file = os.path.isfile(args[0])
  if arg_is_a_file:
    CrawlerClass = ThreadedCrawler
  else:
    CrawlerClass = Crawler
  t0 = datetime.datetime.now()
  c = CrawlerClass(args[0], options.log_level)
  c.Run()
  if not arg_is_a_file and c.url_error:
    logger.error(
        'ERROR: "%s" is neither a valid filename nor a valid URL', args[0])
  t1 = datetime.datetime.now()
  delta_t = t1 - t0
  logger.info('Started at: %s\n', t0)
  logger.info('Ended at: %s\n', t1)
  logger.info('Total execution time: %s\n', delta_t)
  return 0


if __name__ == "__main__":
  sys.exit(main())