
Ok, I've rewritten the roboshop script. Now it's shorter, and more reliable (as reliable as a webscrapper can be)
So If you want code(you got it) (yeah, I know it sounds better with "if you want blog, you got it", but there's little chance I can make this pun, and AC/DC is cool anyway).
The site seems to send different html pages depending on some (unknown to me) variables.
So I analyzed the two variants. At present version (alpha 0.01) it just outputs an array with prices, but I'm in the process of making it cooler, and send me mails when it finds some interesting thing in the INTERNETZ.
Here you can see the use for HTML::Element look_down with chained calls to get the desired node. Apart from cpan, there's more info here.
I'm thinking about writing a WWW::Search subclass too. We'll see...
Btw, I don't understand why perl isn't syntax-highlighted when embedded in the blog. If you have any pointers, I'd be very happy to hear a way to solve that. Sorry for the inconvenience.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use HTML::TreeBuilder; | |
use LWP::UserAgent; | |
use Data::Dump qw(ddx dump); | |
use Data::Dumper; | |
use Memoize; | |
use feature ':5.10'; | |
sub trimEUR { | |
my ($str) = @_; | |
$str =~ s/EUR (.+)$/$1/; | |
return $str; | |
} | |
sub getShippingRate { | |
#http://www.iberlibro.com/servlet/ShipRates?vid=51947087 | |
my ($id) = @_; | |
my $response = LWP::UserAgent->new(agent=>"Mozilla Firefox")->request( | |
HTTP::Request->new( GET => "http://www.iberlibro.com/servlet/ShipRates?vid=$id")); #$id | |
my $tree = HTML::TreeBuilder->new(); | |
$tree->parse($response->content); | |
$tree->eof; | |
my $t = ($tree->look_down( | |
'class','data' | |
))[0]; | |
my $row = ($t->look_down('_tag' , 'tr'))[1]; | |
my $col = ($row->look_down('_tag', 'td'))[1]; | |
$col->as_text =~ /EUR (.+)/; | |
return $1; | |
} | |
sub fixPaths { | |
my ($root) = @_; | |
foreach my $link ( | |
$root->look_down('_tag','a', | |
sub { | |
return unless $_[0]->attr('href') =~ /^\//; | |
$_[0]->attr('href', "http://iberlibro.com". $_[0]->attr('href')); | |
} | |
)) { | |
# say $link->dump; # | |
} | |
} | |
memoize('getShippingRate'); | |
my @conf = <DATA>; | |
my $h = eval join '' , @conf; | |
print Dumper $h; | |
my ($k,$v); | |
find($k, $v) while (($k,$v)= each %$h); | |
sub find{ | |
my ($book, $maxprice) =@_; | |
$book =~ s/ /\+/g; | |
say $book; | |
my $response = LWP::UserAgent->new(agent=>"Mozilla Firefox")->request( | |
#HTTP::Request->new(GET => "http://www.iberlibro.com/servlet/SearchResults?sts=t&tn=${book}&x=0&y=0")); | |
HTTP::Request->new( GET => | |
"http://www.iberlibro.com/servlet/SearchResults?sts=t&tn=%22${book}%22&x=0&y=0")); | |
#my $tree = HTML::TreeBuilder->new_from_file("webfonts.html"); | |
#workaround to be able to parse the first result table | |
$response = $response->content; | |
$response =~ s/<div id="mainbodydiv">//; | |
$response =~ s/<div id="header">//; | |
my $tree = HTML::TreeBuilder->new(); | |
$tree->parse($response); | |
$tree->eof; | |
my $root = $tree->root; | |
my @a = $root->look_down('_tag','table', 'class','result'); | |
my @fp ; | |
foreach my $res (@a) { | |
# print $res->dump; | |
# exit; | |
my @pr = $res->look_down('_tag','span' , 'class' , 'price'); | |
if (@pr == 2) { | |
push @fp , trimEUR($pr[0]->as_text) + trimEUR($pr[1]->as_text) ; | |
} | |
elsif (@pr==1){ | |
say ref $res; | |
my ($texto) = $res->look_down('_tag', 'td', 'class', 'shipping') | |
->look_down('_tag', 'span', 'class', 'scndInfo') | |
->look_down('_tag', 'a') | |
->attr('href') =~ /vid=(.+)$/; | |
push @fp, trimEUR($pr[0]->as_text) + getShippingRate($texto); | |
} | |
else { | |
say q(<span class="price" not found ) ; | |
# my @aver = $res->look_down('_tag','td' , 'class' , 'result-addToBasket'); | |
} | |
} | |
#push @prices, [ $i++ , $_->look_down('_tag','span' , 'class' , 'price')] for @a; | |
#say $_->[0] , " ", $_->[1]->as_text for @prices; | |
say $fp[0]; | |
#my $element = HTML::Element->new('b'); | |
#$element->push_content(getShippingRate($id) + $1); | |
#$scndInfo->push_content($element); | |
#fixPaths($root); | |
#open my $tmp , '>/tmp/a.html'; | |
#print $tmp $tree->as_HTML; | |
#close $tmp; | |
$tree->delete; | |
#system ('firefox /tmp/a.html'); | |
} | |
__END__ | |
{ 'higher order perl' => 25, | |
'paradigms of artificial intelligence programming' => 30 , | |
'design patterns smalltalk companion' => 25, | |
}; |
See ya in the next post.
Thanks for the shopped Artwork go to nabax. =)
1 comentario:
LULZ!
Però el meu domini encara no rula, estic esperant que els alemanys decideixin si sóc legal...
KBAI!
Publicar un comentario