BlogDump.pm
#!/usr/bin/perl
package BlogDump;
use strict;
use warnings;
use Text::Unaccent;
use Data::Dumper; # debug
use WWW::Mechanize;
use Article;
use constant FEED => '/feed/rss';
#
# konstruktor
#
sub new {
my $self = {
url => undef,
_bot => undef
};
bless $self, 'BlogDump';
$self->{_bot} = $self->newBot();
return $self;
}
sub dump {
my ($self, $url) = @_;
$self->{url} = $url;
my $b = $self->{_bot};
$b->get($url.FEED);
my $c = $b->content;
my @r;
while($c =~ /(http:\/\/[^\/<]+\/[^<]+)/g) {
print "- ".$1."\n";
push(@r, $self->dumpArticle($1));
}
return @r;
}
sub dumpArticle {
my ($self, $url) = @_;
my $b = $self->{_bot};
$b->get($url);
my $c = $b->content;
$a = Article->new();
$a->{url} = $url;
$c =~ /<h1>([^<]+)/s;
$a->{title} = $1;
print " - $1\n";
$c =~ /<div class="article">\s*<div class="top">(.*?)<\/div>(.*?)<\/div>/s;
my $top = $1;
$a->{content} = $2;
if($top =~ />([^<]+)<\/a>/s) {
$a->{category} = $1;
$a->{category} =~ s/>//g;
$a->{category} =~ s/<//g;
$a->{category} =~ s/&/&/g;
}
$a->{title} =~ s/&/&/g;
$a->{title} =~ s/>//g;
$a->{title} =~ s/<//g;
return $a;
}
#
# Vytvori noveho robota
#
sub newBot {
my ($self) = @_;
my $bot = WWW::Mechanize->new();
$bot->_reset_page;
$bot->cookie_jar( HTTP::Cookies->new( file => "cookies.dat" ) );
$bot->agent_alias('Windows IE 6');
return $bot;
}
1;
Tagy:
perl 98 řádků | 2008-05-29 22:57:37 | air.kadlec@seznam.cz