rickwash.com_papers_cscw08-appendix_parse_all_url
rickwash.com_papers_cscw08-appendix_parse_all_url
Snapshot-Content-Location:
https://rickwash.com/papers/cscw08-appendix/parse_all_url.txt
Subject:
Date: Thu, 13 Feb 2025 12:17:15 -0700
MIME-Version: 1.0
Content-Type: multipart/related;
type="text/html";
boundary="----MultipartBoundary--
d34iuTKpDSVocPkCb4t5DuMOZRbCF6XhEoVT21BOZO----"
------MultipartBoundary--d34iuTKpDSVocPkCb4t5DuMOZRbCF6XhEoVT21BOZO----
Content-Type: text/html
Content-ID: <[email protected]>
Content-Transfer-Encoding: binary
Content-Location: https://rickwash.com/papers/cscw08-appendix/parse_all_url.txt
use File::Find;
# Config info
$db_name = "delicious2007";
$db_user = "root";
$db_passwd = "********";
sub create_tables
{
($sample) = @_;
$site_sql = "
CREATE TABLE `${sample}_site` (
`id` int(11) NOT NULL auto_increment,
`deliciousID` varchar(200) NOT NULL default '',
`title` varchar(400) default NULL,
`url` varchar(500) default NULL,
`user` varchar(200) default NULL,
`date` date default NULL,
`position` int(11) default NULL,
PRIMARY KEY (`id`),
KEY `deliciousID` (`deliciousID`),
KEY `date` (`date`),
KEY `position` (`position`),
KEY `user` (`user`),
KEY `id_date` (`deliciousID`,`date`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1";
$tag_sql = "
CREATE TABLE `${sample}_tag` (
`id` int(11) NOT NULL auto_increment,
`site_id` int(11) default NULL,
`tag` varchar(200) default NULL,
`position` int(11) default NULL,
PRIMARY KEY (`id`),
KEY `tag` (`tag`),
KEY `deliciousID` (`site_id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1";
$dbh->do($site_sql);
$dbh->do($tag_sql);
}
sub fix_month {
$_ = shift;
s/Jan/01/g;
s/Feb/02/g;
s/Mar/03/g;
s/Apr/04/g;
s/May/05/g;
s/Jun/06/g;
s/Jul/07/g;
s/Aug/08/g;
s/Sep/09/g;
s/Oct/10/g;
s/Nov/11/g;
s/Dec/12/g;
return $_;
}
$meta_sql = "INSERT INTO ${sample}_tag (site_id, tag, position) VALUES (?, ?, ?)";
$meta_sth = $dbh->prepare($meta_sql);
# First pass through the file. Find the Title, URL, and total number of posts
sub pass_one()
{
# Get the Site ID number from the title
if (/<title>del.icio.us\/url\/([^<]*)<\/title>/)
{
$site_id = $1;
}
if (/<h4 class="nomb"><a href="([^\"]*)" rel="nofollow">
([^<]*)<\/a><\/h4>/){
$title = $2;
$url = $1;
}
if (/this url has been saved by (\d*) people/) {
$total_count = $1
}
}
# Pass two through the file. Find the individual posts and store them in the
database
sub pass_two()
{
if (/<h5 class="datehead">(...) &lsquo;(\d\d)<\/h5>/) {
$mo = fix_month($1);
if ($2 < 90) {
$date = "20$2-${mo}-01 00:00:00";
} else {
$date = "19$2-${mo}-01 00:00:00";
}
}
if (/<li><p>by/) { # Found entry
@tags = ();
$order = 1;
if (/who" href="[^\\"]*">([^<]*)/) {
$user = $1;
}
while (/(to) <a href="[^"]*">([^<]*)<\/a>(.*)/) {
push @tags, [ $2, $order ];
$_ = $1 . $3; # Remove the matched section
$order += 1;
}
$site_sth->execute($site_id, $title,$url,$user,$date, $number--) || warn
"SQL Error Inserting into site";
foreach my $i (@tags) {
($tag, $order) = @{$i};
$meta_sth->execute($id, $tag, $order) || warn "SQL Error Inserting into
metadata";
}
}
sub parse_file {
my ($fname) = @_;
$number = 1;
# First pass....
open(INFILE, "<", $fname) || warn("Cannot open $fname: $!");
# Loop through the file, reading one line at a time
while(<INFILE>) {
pass_one()
}
close INFILE;
$number = $total_count;
# Second Pass
open(INFILE, "<", $fname) || warn("Cannot open $fname: $!");
# Loop through the file, reading one line at a time
while(<INFILE>) {
pass_two()
}
close INFILE;
}
sub check_file {
if (!(/^.*\.html$/)) { return; }
parse_file($_);
}
find(\&check_file, ".");
</pre></body></html>
------MultipartBoundary--d34iuTKpDSVocPkCb4t5DuMOZRbCF6XhEoVT21BOZO------