randomfox (randomfox) wrote,
randomfox
randomfox

Perl Scripts for User Stats on the 10 Million Photos Flickr Group

These scripts generate a Top N list for users in the 10 Million Photos Flickr group.

1. ApiKey.pm, a short module containing the API key:


#!perl -w
use strict;

# ApiKey - shared Flickr API credentials and the target group id.
# The key/secret/token values are blank in this published copy; fill
# them in before running any of the scripts.
package ApiKey;
use base 'Exporter';
# Exported unconditionally so every script sees the same credentials.
our @EXPORT = qw($api_key $shared_secret $auth_token $groupid);

our $api_key = "";        # Flickr API key
our $shared_secret = "";  # shared secret paired with the API key
our $auth_token = "";     # auth token passed on every API call
our $groupid = '20759249@N00';  # id of the 10 Million Photos group

1;

__END__



2. TenCommon.pm, functions shared by all the scripts:


#!perl -w
use strict;

# TenCommon - helpers shared by all the stats scripts: Flickr API
# access with retry, photo-list parsing, and the SQLite database layer.
package TenCommon;
use base 'Exporter';
our @EXPORT = qw(
    FlickrRetry GenPhotoList GetDateaddedRange GetPage
    OpenDB CloseDB AddToDB HighestDateInDB OwnerStatsFromDB
);

use Flickr::API;

use XML::Simple;
use LWP::UserAgent;
use Time::HiRes qw(usleep);

use ApiKey;

# Microseconds to pause after each API call (crude rate limiting).
my $SLEEPTIME = 500000;

# Query Flickr with retry.
# Execute a Flickr API method, retrying up to 5 more times on failure.
# Sleeps $SLEEPTIME microseconds after every attempt.
#
# Arguments: $method - API method name string
#            $param  - hashref of method parameters (left unmodified)
# Returns the last Flickr::API response object; callers must still
# check $response->{success} themselves.
sub FlickrRetry {
    my $method = shift;
    my $param = shift;

    # Merge in the auth token on a copy instead of mutating the
    # caller's hashref in place.
    my %args = ( %{$param}, auth_token => $auth_token );

    my $retry_count = 0;
    my $response;
    do {
        # A fresh API object per attempt, using direct method-call
        # syntax rather than indirect object syntax (new Flickr::API).
        my $api = Flickr::API->new(
            {
                'key' => $api_key,
                secret => $shared_secret
            }
        );
        $response = $api->execute_method($method, \%args);
        usleep $SLEEPTIME;
    } while $retry_count++ < 5 and not $response->{success};
    $response;
}

# Generate a list of photos from the Flickr query response.
# Parse a flickr.groups.pools.getPhotos response into a photo list.
# Prints "Page X of Y..." progress as a side effect.
#
# Argument: $response - Flickr::API response (raw XML in _content).
# Returns ( total_page_count, arrayref of photo hashrefs ); each photo
# hashref gains "id" and "url" fields.
sub GenPhotoList {
    my $response = shift;

    # XML::Simple's default KeyAttr folds the <photo> elements into a
    # hash keyed by photo id; forcearray keeps a single <photo> from
    # collapsing into a bare attribute hash.
    my $xmlp = XML::Simple->new;
    my $xm = $xmlp->XMLin($response->{_content}, forcearray => ['photo']);

    my $photos = $xm->{photos};
    print "Page $photos->{page} of $photos->{pages}...\n";

    my $photolist = $photos->{photo};
    my @photoarr;

    # Sort the ids so the output order is deterministic between runs
    # (bare "keys" order varies per process).
    for my $id (sort keys %{$photolist}) {
        my $photo = $photolist->{$id};
        $photo->{id} = $id;
        $photo->{url} = "http://www.flickr.com/photos/$photo->{owner}/$photo->{id}";
        push @photoarr, $photo;
    }
    ( $photos->{pages}, \@photoarr );
}

# Get highest and lowest values of the dateadded field.
# Scan a list of photo records and return the smallest and largest
# values of their dateadded fields, as ($lodate, $hidate).
# An empty list yields (0x7FFFFFFF, 0), i.e. lo > hi.
sub GetDateaddedRange {
    my $photolist = shift;

    my $earliest = 0x7FFFFFFF;
    my $latest   = 0;

    for my $entry (@{$photolist}) {
        my $added = $entry->{dateadded};
        $earliest = $added if $added < $earliest;
        $latest   = $added if $added > $latest;
    }

    return ($earliest, $latest);
}

# Fetch one page of the group pool and summarize it.
# Arguments: $groupid, $pagenum, $pagelen.
# Dies if the API call does not succeed after retries.
# Returns ( total_pages, lowest_dateadded, highest_dateadded, photolist_ref ).
sub GetPage {
    my ($groupid, $pagenum, $pagelen) = @_;

    my $response = FlickrRetry(
        "flickr.groups.pools.getPhotos",
        {
            group_id => $groupid,
            per_page => $pagelen,
            page     => $pagenum,
        });

    $response->{success}
        or die "Error: $response->{error_message}\n";

    my ($totalpages, $photolist) = GenPhotoList($response);
    my ($lodate, $hidate) = GetDateaddedRange($photolist);

    return ($totalpages, $lodate, $hidate, $photolist);
}


use DBI;

# SQLite database file and the table holding one row per scanned
# page cluster.
my $DBNAME = "10pool.db";
my $TBLNAME = "statstable";

# Open (and, on first use, initialize) the SQLite stats database.
# Returns a DBI handle with RaiseError enabled, so most DBI calls die
# on failure automatically.
sub OpenDB {
    my $dbh = DBI->connect(
        "dbi:SQLite:dbname=$DBNAME", "", "",
        { RaiseError => 1, AutoCommit => 1 });

    # PRAGMA table_info returns no rows when the table does not exist.
    my $info = $dbh->prepare("PRAGMA table_info($TBLNAME)")
        or die $dbh->errstr;
    $info->execute
        or die $info->errstr;

    my @first = $info->fetchrow_array;
    unless (@first) {
        die $info->errstr if $info->err;
        print "Table $TBLNAME not present\n";
        my $create = $dbh->prepare(
            "CREATE TABLE $TBLNAME ("
            . "key INTEGER PRIMARY KEY, "
            . "startpage INTEGER, endpage INTEGER, "
            . "lowdate INTEGER, highdate INTEGER, "
            . "stats TEXT, "
            . "timestamp INTEGER"
            . ")") or die $dbh->errstr;
        $create->execute
            or die $create->errstr;
    }
    return $dbh;
}

# Disconnect the given DBI handle.
sub CloseDB {
    my ($handle) = @_;
    return $handle->disconnect;
}

# Insert one page-cluster record into the stats table, stamped with the
# current time.
# Arguments: $dbh, $startpage, $endpage, $lowdate, $highdate, $stats.
sub AddToDB {
    my ($dbh, $startpage, $endpage, $lowdate, $highdate, $stats) = @_;

    my $insert = $dbh->prepare("INSERT INTO $TBLNAME (startpage, endpage, lowdate, highdate, stats, timestamp) VALUES (?, ?, ?, ?, ?, ?)")
        or die $dbh->errstr;
    $insert->execute($startpage, $endpage, $lowdate, $highdate, $stats, time)
        or die $insert->errstr;
}

# Find the most recent dateadded recorded in the database and the page
# number of the record that holds it.
# Returns ($highestdate, $toppage); both 0 when the table is empty.
sub HighestDateInDB {
    my ($dbh) = @_;

    my ($highestdate, $toppage) = (0, 0);

    my $query = $dbh->prepare("SELECT endpage, highdate FROM $TBLNAME WHERE highdate = (SELECT max(highdate) FROM $TBLNAME)")
        or die $dbh->errstr;
    $query->execute
        or die $query->errstr;

    if (my @found = $query->fetchrow_array) {
        ($toppage, $highestdate) = @found;
    }
    elsif ($query->err) {
        die $query->errstr;
    }

    return ($highestdate, $toppage);
}

# Aggregate per-owner photo counts from every stats blob in the table.
# Each blob holds lines of the form "count,ownerid,ownername".
# Returns a hashref: ownerid => { owner, name, count }.
sub OwnerStatsFromDB {
    my ($dbh) = @_;

    my %owners;

    my $query = $dbh->prepare("SELECT stats FROM $TBLNAME")
        or die $dbh->errstr;
    $query->execute
        or die $query->errstr;

    while (my @record = $query->fetchrow_array) {
        for my $line (split /\n/, $record[0]) {
            # Lines with fewer than three fields are silently skipped.
            my ($count, $owner, $ownername) = split /,/, $line, 3;
            next unless defined $ownername;

            # First sighting of this owner: start a zeroed entry.
            $owners{$owner} ||= {
                owner => $owner,
                name  => $ownername,
                count => 0,
            };
            $owners{$owner}{count} += $count;
        }
    }
    die $query->errstr if $query->err;

    return \%owners;
}

1;

__END__



3. runten2.pl, scan the group pool and update the database incrementally:


#!perl -w
# runten2.pl - scan the group pool and update the database incrementally.
use strict;

use FileHandle;
use File::DosGlob;

use TenCommon;
use ApiKey;

my $pagelen = 500;        # photos requested per pool page
my $pageclusterlen = 10;  # pages handed to one tenmil.pl invocation

# Fetch page 1 just to learn the total page count of the group pool.
sub GetTotalPages {
    my ($pages) = GetPage($groupid, 1, $pagelen);
    return $pages;
}

# Return the (lowest, highest) dateadded values found on one pool page.
sub GetDateRange {
    my ($pagenum) = @_;
    my (undef, $low, $high) = GetPage($groupid, $pagenum, $pagelen);
    return ($low, $high);
}

# Get the highest date from output files in this directory.
sub GetHighestDate_old {
    my $highestdate = 0;
    my $toppage = 0;

    my @filelist = File::DosGlob::glob "x*.htm";
    for my $file (@filelist) {
	my $fh = new FileHandle $file, "r";
	my $firstline = <$fh>;
	if ($firstline =~ /Pages \d+ to (\d+): Dates \d+ to (\d+):/) {
	    my $hipage = $1;
	    my $hidate = $2;
	    if ($hidate > $highestdate) {
		$highestdate = $hidate;
		$toppage = $hipage;
	    }
	}
	$fh->close;
    }

    return ($highestdate, $toppage);
}

# Database-backed lookup of the highest dateadded recorded so far and
# the page number it came from.  Returns ($highestdate, $toppage).
sub GetHighestDate {
    my $dbh = OpenDB;
    my @result = HighestDateInDB($dbh);
    CloseDB($dbh);
    return @result;
}

# Do binary search to find the page number containing a specific date.
sub FindPage {
    my $lower = 1;
    my $upper = shift;
    my $date = shift;

    print "Binary search: low=$lower upr=$upper Looking for date $date...\n";

    while (1) {
	my $mid = int(($lower + $upper) / 2);

	my ($lodate, $hidate) = GetDateRange($mid);

	print "Binary search: low=$lower upr=$upper mid=$mid Dates $lodate to $hidate...\n";
	if ($date > $hidate) {
	    $upper = $mid;
	}
	elsif ($date < $lodate) {
	    $lower = $mid;
	}
	else {
	    return $mid;
	}
    }
}

# Driver: work backwards through the pool in clusters of pages,
# invoking tenmil.pl for each cluster, until everything newer than the
# previously-recorded high-water mark has been scanned.
sub main {
    my $totalpages = GetTotalPages;
    print "Total pages = $totalpages\n";

    my ($highestdate, $toppage) = GetHighestDate();
    print "Highest date = $highestdate, Top page = $toppage\n";

    # First ever run has no high-water mark: start from the oldest page.
    my $highestpage = ($highestdate ?
        FindPage($totalpages, $highestdate) :
        $totalpages);

    while (1) {
        # Re-read the database on every pass in case the most recent
        # run of tenmil.pl failed and made no progress.
        my ($hidate, $hipage) = GetHighestDate();
        my $endset = $highestpage;
        $hidate > $highestdate and $endset = $hipage - $pageclusterlen;

        print "hidate = $hidate, hipage = $hipage, endset = $endset\n";

        last if $endset <= 0;

        my $startset = $endset - $pageclusterlen + 1;
        $startset < 1 and $startset = 1;
        print "startset = $startset, endset = $endset\n";

        # List-form system bypasses the shell entirely; also surface a
        # failed child instead of silently looping on it.
        my $status = system "perl", "tenmil.pl", $startset, $endset, $highestdate;
        $status == 0 or warn "tenmil.pl exited with status $status\n";
    }
}

# Script entry point.
main;

__END__



4. tenshort.pl, use Flickr API to correct the stats for the Top 150 and produce the report:


#!perl -w
# tenshort.pl - re-query exact counts for the Top 150 and write the report.
use strict;

use FileHandle;
use File::DosGlob;

use TenCommon;
use ApiKey;

# Expand DOS-style wildcard patterns in the argument list into the
# matching filenames.
sub glob_args {
    my @expanded;
    push @expanded, File::DosGlob::glob($_) for @_;
    return @expanded;
}

# Count a user's photos by setting the page length to 1 and getting the
# number of pages.
# Count one user's photos in the group by requesting a page length of 1
# and reading back the total page count (which then equals the photo
# count).
# Arguments: $groupid, $userid.  Dies on API failure.
sub GetCount {
    my $groupid = shift;
    my $userid = shift;

    my $response = FlickrRetry("flickr.groups.pools.getPhotos",
        {
            group_id => $groupid,
            user_id => $userid,
            per_page => 1
        });
    die "Error: $response->{error_message}\n" unless $response->{success};

    # Direct constructor call rather than indirect object syntax.
    my $xmlp = XML::Simple->new;
    my $xm = $xmlp->XMLin($response->{_content});
    my $photos = $xm->{photos};

    return $photos->{pages};
}

# Build the Top-150 report: rank owners by the approximate counts in
# the database, re-query Flickr for exact counts of the top users, then
# write the re-sorted list to newcount.txt.
sub main {
    my $dbh = OpenDB;
    my $owners = OwnerStatsFromDB $dbh;
    CloseDB $dbh;

    my @ranked = sort { $b->{count} <=> $a->{count} } values %$owners;
    # Guard against fewer than 150 owners: slicing past the end of the
    # array would pad @topusers with undefs and crash below.
    my $last = $#ranked < 149 ? $#ranked : 149;
    my @topusers = @ranked[0 .. $last];

    my $lineno = 0;
    for my $user (@topusers) {
        $lineno++;
        print "$lineno. Getting photocount for user $user->{name}...\n";
        $user->{count} = GetCount($groupid, $user->{owner});
    }

    # Re-rank using the exact counts.
    @topusers = sort { $b->{count} <=> $a->{count} } @topusers;

    my $fh = FileHandle->new("newcount.txt", "w");
    defined $fh or die "Can't open newcount.txt for writing: $!\n";

    $lineno = 0;
    for my $user (@topusers) {
        $lineno++;
        print $fh "$lineno. $user->{name}: $user->{count}\n";
    }

    $fh->close;
}

# Script entry point.
main;

__END__



5. tenconvert.pl, convert from old flatfile database to SQLite:


#!perl -w
use strict;

# tenconvert.pl - add legacy flat-file datafiles to the SQLite database.
# Usage: tenconvert.pl x*.htm
# (The pattern is globbed here; the Windows shell does not expand it.)

use FileHandle;
use File::DosGlob;

use TenCommon;

# Expand DOS-style wildcard patterns in the argument list into the
# matching filenames.
sub glob_args {
    return map { File::DosGlob::glob($_) } @_;
}

# Import legacy flat-file records into the database.
# For each file: the first line carries the page and date ranges; the
# remainder is the raw stats blob, stored verbatim.
# Arguments: $dbh, then a list of filenames.  Dies if a file cannot be
# opened; files without a valid header line are skipped.
sub ProcessFiles {
    my $dbh = shift;

    for my $file (@_) {
        my $fh = FileHandle->new($file, "r");
        defined $fh or die "Error opening file $file for reading: $!\n";
        my $line = <$fh>;
        unless (defined $line
            and $line =~ /Pages (\d+) to (\d+): Dates (\d+) to (\d+):/) {
            # Close explicitly before skipping a headerless/empty file.
            $fh->close;
            next;
        }

        my $startpage = $1;
        my $endpage = $2;
        my $lowdate = $3;
        my $highdate = $4;

        # Slurp the rest of the file as the stats blob.
        my $stats = '';
        while (defined($line = <$fh>)) {
            $stats .= $line;
        }

        $fh->close;

        AddToDB($dbh, $startpage, $endpage, $lowdate, $highdate, $stats);
    }
}

my $dbh = OpenDB;
# Much faster to turn AutoCommit off if adding many records.
$dbh->{AutoCommit} = 0;
ProcessFiles($dbh, glob_args @ARGV);
# Single commit for the whole batch, then disconnect.
$dbh->commit;
CloseDB $dbh;

__END__

Subscribe
  • Post a new comment

    Error

    Anonymous comments are disabled in this journal

    default userpic

    Your reply will be screened

    Your IP address will be recorded 

  • 0 comments