0% found this document useful (0 votes)
17 views

rickwash.com_papers_cscw08-appendix_parse_all_url

The document contains a Perl script named 'parse_all_url.pl' created by Rick Wash, which is designed to download and parse pages from del.icio.us and save them as HTML files. It includes functionality to create database tables for storing site and tag information, as well as methods for processing HTML files to extract relevant data. The script provides commands for creating a database and parsing saved HTML files to populate the database with the extracted information.

Uploaded by

zackm198613
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
17 views

rickwash.com_papers_cscw08-appendix_parse_all_url

The document contains a Perl script named 'parse_all_url.pl' created by Rick Wash, which is designed to download and parse pages from del.icio.us and save them as HTML files. It includes functionality to create database tables for storing site and tag information, as well as methods for processing HTML files to extract relevant data. The script provides commands for creating a database and parsing saved HTML files to populate the database with the extracted information.

Uploaded by

zackm198613
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 4

From: <Saved by Blink>

Snapshot-Content-Location:
https://rickwash.com/papers/cscw08-appendix/parse_all_url.txt
Subject:
Date: Thu, 13 Feb 2025 12:17:15 -0700
MIME-Version: 1.0
Content-Type: multipart/related;
type="text/html";
boundary="----MultipartBoundary--
d34iuTKpDSVocPkCb4t5DuMOZRbCF6XhEoVT21BOZO----"

------MultipartBoundary--d34iuTKpDSVocPkCb4t5DuMOZRbCF6XhEoVT21BOZO----
Content-Type: text/html
Content-ID: <[email protected]>
Content-Transfer-Encoding: binary
Content-Location: https://rickwash.com/papers/cscw08-appendix/parse_all_url.txt

<html><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-


8"><meta name="color-scheme" content="light dark"></head><body><pre style="word-
wrap: break-word; white-space: pre-wrap;">#!/usr/bin/perl
# Copyright (c) 2008 Rick Wash &lt;[email protected]&gt;
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# parse_all_url.pl
# Rick Wash &lt;[email protected]&gt;
#
# Download a ton of pages from del.icio.us/URL (or from a person's delicious page)
# and save them as .html files.
# Then, you can run this file:
# - parse_all_url create &lt;name&gt;
# (creates the database tables &lt;name&gt;_site and &lt;name&gt;_tag)
# - parse_all_url &lt;name&gt;
# (parses all of the .html files in the current and all sub-directories)

use File::Find;

# Config info
$db_name = "delicious2007";
$db_user = "root";
$db_passwd = "********";

# Connect to the Database


use DBI;
$dbh = DBI-&gt;connect("dbi:mysql:$db_name", $db_user, $db_passwd);

# Create the two MySQL tables that hold the parsed del.icio.us data:
#   <sample>_site - one row per saved bookmark (site) occurrence
#   <sample>_tag  - one row per tag attached to a bookmark
# Takes the sample name used as the table-name prefix.  Table creation
# failures are now reported (the original ignored the return of do()).
sub create_tables
{
    my ($sample) = @_;

    my $site_sql = "
    CREATE TABLE `${sample}_site` (
      `id` int(11) NOT NULL auto_increment,
      `deliciousID` varchar(200) NOT NULL default '',
      `title` varchar(400) default NULL,
      `url` varchar(500) default NULL,
      `user` varchar(200) default NULL,
      `date` date default NULL,
      `position` int(11) default NULL,
      PRIMARY KEY (`id`),
      KEY `deliciousID` (`deliciousID`),
      KEY `date` (`date`),
      KEY `position` (`position`),
      KEY `user` (`user`),
      KEY `id_date` (`deliciousID`,`date`)
    ) ENGINE=MyISAM DEFAULT CHARSET=latin1";

    my $tag_sql = "
    CREATE TABLE `${sample}_tag` (
      `id` int(11) NOT NULL auto_increment,
      `site_id` int(11) default NULL,
      `tag` varchar(200) default NULL,
      `position` int(11) default NULL,
      PRIMARY KEY (`id`),
      KEY `tag` (`tag`),
      KEY `deliciousID` (`site_id`)
    ) ENGINE=MyISAM DEFAULT CHARSET=latin1";

    $dbh->do($site_sql)
        or warn "Creating ${sample}_site failed: " . $dbh->errstr;
    $dbh->do($tag_sql)
        or warn "Creating ${sample}_tag failed: " . $dbh->errstr;
}

# Convert every three-letter English month abbreviation in the argument to
# its two-digit month number (e.g. "Jan" -> "01", "Dec '07" -> "12 '07").
#
# Fix vs. the original: the original assigned to the global $_ ("$_ = shift")
# without localizing it, clobbering the caller's current line buffer — and
# its chain of s/// operations also reset the regex capture variables.  A
# lexical copy avoids touching the caller's $_ entirely.
sub fix_month {
    my ($text) = @_;
    my %num_of = (
        Jan => '01', Feb => '02', Mar => '03', Apr => '04',
        May => '05', Jun => '06', Jul => '07', Aug => '08',
        Sep => '09', Oct => '10', Nov => '11', Dec => '12',
    );
    # One alternation is equivalent to the original twelve s///g passes:
    # the abbreviations do not overlap and no replacement creates a new one.
    $text =~ s/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/$num_of{$1}/g;
    return $text;
}

# --- Command-line handling -------------------------------------------------
# parse_all_url create <name>  : create the <name>_site/<name>_tag tables
# parse_all_url <name>         : parse .html files into those tables
my $first = shift;    # first parameter from command line

# The original proceeded with an undef sample name (yielding tables named
# "_site"/"_tag" or broken SQL); fail fast with a usage message instead.
die "usage: parse_all_url [create] <name>\n" unless defined $first;

if ($first eq "create") {
    my $name = shift;
    die "usage: parse_all_url create <name>\n" unless defined $name;
    create_tables($name);
    exit(0);
}

my $sample = $first;

# Pre-compile the two INSERT statements used by pass_two.
# (The site INSERT literal is rejoined here: the captured page had wrapped
# it across several lines.)  prepare() failures now abort instead of leaving
# an undef statement handle.
$site_sql = "INSERT INTO ${sample}_site (deliciousID, title, url, user, date, position) VALUES (?, ?, ?, ?, ?, ?)";
$site_sth = $dbh->prepare($site_sql)
    or die "Cannot prepare site INSERT: " . $dbh->errstr;

$meta_sql = "INSERT INTO ${sample}_tag (site_id, tag, position) VALUES (?, ?, ?)";
$meta_sth = $dbh->prepare($meta_sql)
    or die "Cannot prepare tag INSERT: " . $dbh->errstr;

# First pass through the file.  Find the Title, URL, and total number of posts
#
# Called once per input line with the line in $_ (set by parse_file's read
# loop).  Results are stored in the globals $site_id, $title, $url and
# $total_count, which pass_two later reads.
sub pass_one()
{
    # Get the Site ID number from the title: the trailing path component of
    # "<title>del.icio.us/url/<hash></title>".
    if (/<title>del.icio.us\/url\/([^<]*)<\/title>/)
    {
        $site_id = $1;
    }
    # The bookmark headline links to the saved URL: capture both the href
    # ($url) and the link text ($title).
    if (/<h4 class="nomb"><a href="([^\"]*)" rel="nofollow">([^<]*)<\/a><\/h4>/){
        $title = $2;
        $url = $1;
    }
    # Total number of people who bookmarked this URL; used by parse_file to
    # seed the countdown position counter for pass two.
    if (/this url has been saved by (\d*) people/) {
        $total_count = $1
    }
}

# Pass two through the file.  Find the individual posts and store them in
# the database.
#
# Called once per input line with the line in $_.  Relies on globals set in
# pass one ($site_id, $title, $url) and on the countdown counter $number set
# by parse_file.
#
# Fixes vs. the original:
#  - the sub was missing its closing brace (the next sub started while this
#    one was still open);
#  - $1/$2 from the datehead match were read *after* calling fix_month(),
#    whose substitutions reset the capture variables, so the 19xx/20xx year
#    test never saw the real year.  The captures are saved first now.
sub pass_two()
{
    # Month header, e.g. <h5 class="datehead">Jan &lsquo;07</h5>
    if (/<h5 class="datehead">(...) &lsquo;(\d\d)<\/h5>/) {
        my ($mon, $yy) = ($1, $2);   # save captures before fix_month()
        $mo = fix_month($mon);
        # Two-digit year heuristic: 90-99 => 19xx, everything else => 20xx.
        if ($yy < 90) {
            $date = "20$yy-${mo}-01 00:00:00";
        } else {
            $date = "19$yy-${mo}-01 00:00:00";
        }
    }
    if (/<li><p>by/) { # Found an individual bookmark entry
        @tags = ();
        $order = 1;
        # The user who saved the bookmark.
        if (/who" href="[^\\"]*">([^<]*)/) {
            $user = $1;
        }
        # Collect every tag link on the line, recording its position.
        while (/(to) <a href="[^"]*">([^<]*)<\/a>(.*)/) {
            push @tags, [ $2, $order ];
            $_ = $1 . $3; # Remove the matched section
            $order += 1;
        }
        $site_sth->execute($site_id, $title, $url, $user, $date, $number--)
            || warn "SQL Error Inserting into site";

        $id = $dbh->last_insert_id(undef, undef, undef, undef);

        foreach my $i (@tags) {
            ($tag, $order) = @{$i};
            $meta_sth->execute($id, $tag, $order)
                || warn "SQL Error Inserting into metadata";
        }
    }
}

# Parse one saved .html file in two passes: pass one collects the site id,
# title/URL and total bookmark count; pass two inserts each individual post
# (and its tags) into the database.
#
# Fix vs. the original: on a failed open the original warned and then read
# from a dead bareword handle; it now warns and returns.  Lexical
# filehandles replace the global INFILE.
sub parse_file {
    my ($fname) = @_;

    $number = 1;

    # First pass: loop through the file, one line at a time.
    open(my $in, "<", $fname) or do {
        warn("Cannot open $fname: $!");
        return;
    };
    while (<$in>) {
        pass_one();
    }
    close $in;

    # Positions count down from the total found in pass one.
    # NOTE(review): $total_count is a global; if a file lacks the "saved by
    # N people" line it keeps the previous file's value — confirm intended.
    $number = $total_count;

    # Second pass: insert each post into the database.
    open($in, "<", $fname) or do {
        warn("Cannot open $fname: $!");
        return;
    };
    while (<$in>) {
        pass_two();
    }
    close $in;
}

# File::Find callback: invoked with the current entry's name in $_; hands
# every *.html file to the parser and ignores everything else.
sub check_file {
    return unless /\.html$/;
    parse_file($_);
}

# Walk the current directory tree and process every saved page.
find(\&check_file, ".");
</pre></body></html>
------MultipartBoundary--d34iuTKpDSVocPkCb4t5DuMOZRbCF6XhEoVT21BOZO------

You might also like