When trying to run my perl script via cmd prompt my json string is returning [] I have read other posts, and fixed my database to be utf8 and the error still persists. I have tried two didfferent way to encode my perl string the first was $json = encode_json #temp_array which returns this error hash- or arrayref expected (not a simple scalar, use allow_nonref to allow this However, when I use this line $json_text = $json->encode(#temp_array) I just get []
Here is my perl:
my $json_text;
my $json = JSON->new->utf8;
my #temp_array =[];
my $temp_array;
while (#data = $query_handle->fetchrow_array())
{
my %json_hash = ();
my $hash_ref;
%json_hash = (
"User ID" => $data[0],
"Status" => $data[1],
"Last Password Reset" => $data[2],
"Reset Needed" => $data[3]
);
$hash_ref = \%json_hash;
push (#temp_array, $hash_ref);
}
print $json = encode_json #temp_array . "\n"; #encode with error
print $json_text = $json->encode(#temp_array) . "\n"; #encode with []
print $cgi->header(-type => "application/json", -charset => "utf-8");
print $json_text; #Prints []
So in my own testing, via the cmd prompt I know the while is retrieving the data from my db correctly and is building a hash, which I am assuming is correct.
Is it the fact I am pushing my hash reference to the array instead of the hash itself? Once I get this string built correctly, I will be calling it to an html via jquery
Thank you.
JSON expects references:
print $json = encode_json(\#temp_array) . "\n";
print $json_text = $json->encode(\#temp_array) . "\n";
Edit: Unless you enable allow_nonref.
Another edit: This line is wrong--
my #temp_array =[]; ## should be my #temp_array = ();
and this line overwrites the $json variable:
print $json = encode_json #temp_array . "\n"; ## the next line in your script shouldn't work
Last edit - untested:
my $json = JSON->new->utf8;
my #temp_array;
while (my #data = $query_handle->fetchrow_array()) {
my %json_hash = (
"User ID" => $data[0],
"Status" => $data[1],
"Last Password Reset" => $data[2],
"Reset Needed" => $data[3]
);
push (#temp_array, \%json_hash);
}
print $json->encode(\#temp_array) . "\n";
Related
I'm trying to parse JSON data in Perl. it is request to Cisco Prime Service. My script works, but parsing doesn't work. And I have a warning,
malformed JSON string, neither array, object, number, string or atom, at character offset 0 (before "HTTP::Response=HASH(...") at InternetBSP.pl line 39.
It is here:
my $json_text = $json->allow_nonref->utf8->relaxed->escape_slash->loose->allow_singlequote->allow_barekey->decode($res);
have no Idee how should I fix it...
use strict;
use warnings;
use JSON -support_by_pp;
use LWP 5.64;
use LWP::UserAgent;
use MIME::Base64;
use REST::Client;
use IO::Socket::SSL;
#So dass es auch ohne SSL Sertifizierung funktioniert
BEGIN { $ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0 }
#Create a user agent object
my $ua = LWP::UserAgent->new(
ssl_opts => {
SSL_verify_mode => SSL_VERIFY_NONE(),
verify_hostname => 0,
}
);
#Create a request
my $req = HTTP::Request->new( GET => 'https://10.10.10.10/webacs/api/v1/data/AccessPoints.json?.full=true' );
$req->content_type('application/json');
$req->authorization_basic( "Username", "Password" );
#Pass request to the user agent and get a response back
my $res = $ua->request($req);
#Check the outcome of the Response
if ( $res->is_success ) {
print $res->content;
} else {
print $res->status_line, "n";
}
my $json = new JSON;
my $json_text = $json->allow_nonref->utf8->relaxed->escape_slash->loose->allow_singlequote->allow_barekey->decode($res);
#my try to pasre the data
foreach my $ap ( #{ $json_text->{queryResponse}->{'entity'} } ) {
print "------------------------\nAccess Point " . $ap->{'accessPointsDTO'}->{'#id'} . "\n";
print "Model:" . $ap->{'accessPointsDTO'}->{'model'} . "\n";
print "MAC Address:" . $ap->{'accessPointsDTO'}->{'macAddress'} . "\n";
print "Serial Number:" . $ap->{'accessPointsDTO'}->{'serialNumber'} . "\n";
print "Software Version:" . $ap->{'accessPointsDTO'}->{'softwareVersion'} . "\n";
print "Status:" . $ap->{'accessPointsDTO'}->{'status'} . "\n";
print "Location:" . $ap->{'accessPointsDTO'}->{'location'} . "\n";
}
I have this like outcome:
{"queryResponse":{"#last":"7","#first":"0","#count":"8","#type":"AccessPoints","#responseType":"listEntityInstances","#requestUrl":"https:\/\/10.66.1.23\/webacs\/api\/v1\/ data\/AccessPoints?.full=true","#rootUrl":"https:\/\/10.66.1.23\/webacs\/api\/v1\/data","entity":[{"#dtoType":"accessPointsDTO","#type":"AccessPoints","#url":"https:\/\/10 .66.1.23\/webacs\/api\/v1\/data\/AccessPoints\/205320"
But it shoud be smth like:
{"queryResponse":
{"#type":"AccessPoints",
"#rootUrl":"https://172.18.138.90/webacs/api/v1/data",
"#requestUrl":"https://172.18.138.90/webacs/api/v1/data/AccessPoints?.full=true",
"#responseType":"listEntityInstances",
"entity":[
{"#url":"https://172.18.138.90/webacs/api/v1/data/AccessPoints/13544533",
"#type":"AccessPoints",
"#dtoType":"accessPointsDTO",
"accessPointsDTO":
{"#id":"13544533",
"#displayName":"13544533",
"adminStatus":"ENABLE",
"bootVersion":"12.4.23.0",
"clientCount":0,
After update :)
------------------------
Access Point 205320
Model:AIR-LAP1142N-E-K9
MAC Address:6c:9c:ed:b5:45:60
Serial Number:FCZ1544W51B
Software Version:7.6.130.0
Status:CLEARED
Location:de.bw.stu.
------------------------
Access Point 205322
Model:AIR-CAP3502I-E-K9
MAC Address:0c:f5:a4:ee:70:10
Serial Number:FCZ184680VB
Software Version:7.6.130.0
Status:CLEARED
Location:de.bw.stu.
------------------------
Access Point 205324
Model:AIR-LAP1142N-E-K9
MAC Address:6c:9c:ed:86:9d:20
Serial Number:FCZ1544W50Y
Software Version:7.6.130.0
Status:CLEARED
Location:de.bw.stu.
malformed JSON string, neither array, object, number, string or atom, at character offset 0 (before "HTTP::Response=HASH(...")
This error message means that the data you are giving to decode is not JSON.
You are passing $res to decode, which is an HTTP::Response object (see above, emphasis mine). You need to use $res->content, which you use for debugging output a few lines above.
if ($res->is_success) {
print $res->content;
} else {print $res->status_line, "n";
}
I would rewrite that whole block of code to this.
die $res->status_line unless $res->is_success;
my $json = JSON->new->allow_nonref
->utf8->relaxed
->escape_slash->loose
->allow_singlequote->allow_barekey;
my $json_text = $json->decode( $res->content );
Instead of printing some debug output and then going on anyway if things went wrong you can just die if the request was not successful.
After that, create your JSON object and configure it. This is way more readable than this long line of code, and we're using a method call to new instead of indirect object notation.
Finally, we are decodeing $res->content.
I have the JSON text as given below :
test.json
{
"a" : false
}
I want to create the DBM::Deep hash for above JSON. My code is looks like as given below :
dbm.pl
use strict;
use warnings;
use DBM::Deep;
use JSON;
use Data::Dumper;
# create the dbm::deep object
my $db = DBM::Deep->new(
file => 'test.db',
type => DBM::Deep->TYPE_HASH
);
my $json_text = do {
open( my $json_fh, $path )
or die("Can't open \$path\": $!\n");
local $/;
<$json_fh>;
};
my $json = JSON->new;
my $data = $json->decode($json_text);
print Dumper($data);
# create dbm::deep hash
eval { $db->{$path} = $data; };
if ($#) {
print "error : $#\n";
}
I am getting below output/error on execution of above code:
Error
$VAR1 = {
'a' => bless( do{(my $o = 0)}, 'JSON::XS::Boolean' )
};
error : DBM::Deep: Storage of references of type 'SCALAR' is not supported. at dbm.pl line 26
It seems like, JSON internally uses JSON::XS which convert the 'true' value in JSON::XS::Boolean object and DBM::Deep is not able to handle this, while it can handle the null value.
While the above code is working fine for below inputs:
{
"a" : 'true' # if true is in quotes
}
or
{
"a" : null
}
I tried many thing, but nothing worked. Does anyone has any workaround?
The JSON parser you are using, among others, returns an object that works as a boolean when it encounters true or false in the JSON. This allows the data to be re-encoded into JSON without change, but it can cause this kind of issue.
null doesn't have this problem because Perl has a native value (undef) that can be used to represent it unambiguously.
The following convert these objects into simple values.
sub convert_json_bools {
local *_convert_json_bools = sub {
my $ref_type = ref($_[0])
or return;
if ($ref_type eq 'HASH') {
_convert_json_bools($_) for values(%{ $_[0] });
}
elsif ($ref_type eq 'ARRAY') {
_convert_json_bools($_) for #{ $_[0] };
}
elsif ($ref_type =~ /::Boolean\z/) {
$_[0] = $_[0] ? 1 : 0;
}
else {
warn("Unsupported type $ref_type\n");
}
};
&_convert_json_bools;
}
convert_json_bools($data);
Your code works fine for me, with the only change being to set
my $path = 'test.json';
You should check your module version numbers. These are the ones that I have
print $DBM::Deep::VERSION, "\n"; # 2.0013
print $JSON::VERSION, "\n"; # 2.90
print $JSON::XS::VERSION, "\n"; # 3.02
and I am running Perl v5.24.0
The dumped output is as follows
Newly-created DBM::Deep database
$VAR1 = bless( {}, 'DBM::Deep::Hash' );
output of $json->decode
$VAR1 = {
'a' => undef
};
Populated DBM::Deep database after the eval
$VAR1 = bless( {
'test.json' => bless( {
'a' => undef
}, 'DBM::Deep::Hash' )
}, 'DBM::Deep::Hash' );
All of that looks to be as it should
I have a configuration file which is in XML format. I need to parse the XML and convert to JSON. I'm able to convert it with XML2JSON module of perl. But the problem is, it is not maintaining the order of XML elements. I strictly need the elements in order otherwise I cannot configure
My XML file is something like this. I have to configure an IP address and set that IP as a gateway to certain route.
<Config>
<ip>
<address>1.1.1.1</address>
<netmask>255.255.255.0</netmask>
</ip>
<route>
<network>20.20.20.0</network>
<netmask>55.255.255.0</netmask>
<gateway>1.1.1.1</gateway>
</route>
</Config>
This is my perl code to convert to JSON
my $file = 'config.xml';
use Data::Dumper;
open my $fh, '<',$file or die;
$/ = undef;
my $data = <$fh>;
my $XML = $data;
my $XML2JSON = XML::XML2JSON->new();
my $Obj = $XML2JSON->xml2obj($XML);
print Dumper($Obj);
The output I'm getting is,
$VAR1 = {'Config' => {'route' => {'netmask' => {'$t' => '55.255.255.0'},'gateway' => {'$t' => '1.1.1.1'},'network' => {'$t' => '20.20.20.0'}},'ip' => {'netmask' => {'$t' => '255.255.255.0'},'address' => {'$t' => '1.1.1.1'}}},'#encoding' => 'UTF-8','#version' => '1.0'};
I have a script which reads the json object and configure..
But it fails as it first tries to set gateway ip address to a route where the ip address is not yet configured and add then add ip address.
I strictly want key ip to come first and then route for proper configuration without error. Like this I have many dependencies where order of keys is a must.
Is there any way I can tackle this problem? I tried almost all modules of XML parsing like XML::Simple,Twig::XML,XML::Parser. But nothing helped..
Here's a program that I hacked together that uses XML::Parser to parse some XML data and generate the equivalent JSON in the same order. It ignores any attributes, processing instructions etc. and requires that every XML element must contain either a list of child elements or a text node. Mixing text and elements won't work, and this isn't checked except that the program will die trying to dereference a string
It's intended to be a framework for you to enhance as you require, but works fine as it stands with the XML data you show in your question
use strict;
use warnings 'all';
use XML::Parser;
my $parser = XML::Parser->new(Handlers => {
Start => \&start_tag,
End => \&end_tag,
Char => \&text,
});
my $struct;
my #stack;
$parser->parsefile('config.xml');
print_json($struct->[1]);
sub start_tag {
my $expat = shift;
my ($tag, %attr) = #_;
my $elem = [ $tag => [] ];
if ( $struct ) {
my $content = $stack[-1][1];
push #{ $content }, $elem;
}
else {
$struct = $elem;
}
push #stack, $elem;
}
sub end_tag {
my $expat = shift;
my ($elem) = #_;
die "$elem <=> $stack[-1][0]" unless $stack[-1][0] eq $elem;
for my $content ( $stack[-1][1] ) {
$content = "#$content" unless grep ref, #$content;
}
pop #stack;
}
sub text {
my $expat = shift;
my ($string) = #_;
return unless $string =~ /\S/;
$string =~ s/\A\s+//;
$string =~ s/\s+\z//;
push #{ $stack[-1][1] }, $string;
}
sub print_json {
my ($data, $indent, $comma) = (#_, 0, '');
print "{\n";
for my $i ( 0 .. $#$data ) {
# Note that $data, $indent and $comma are overridden here
# to reflect the inner context
#
my $elem = $data->[$i];
my $comma = $i < $#$data ? ',' : '';
my ($tag, $data) = #$elem;
my $indent = $indent + 1;
printf qq{%s"%s" : }, ' ' x $indent, $tag;
if ( ref $data ) {
print_json($data, $indent, $comma);
}
else {
printf qq{"%s"%s\n}, $data, $comma;
}
}
# $indent and $comma (and $data) are restored here
#
printf "%s}%s\n", ' ' x $indent, $comma;
}
output
{
"ip" : {
"address" : "1.1.1.1",
"netmask" : "255.255.255.0"
},
"route" : {
"network" : "20.20.20.0",
"netmask" : "55.255.255.0",
"gateway" : "1.1.1.1"
}
}
The problem isn't so much to do with XML parsing, but because perl hashes are not ordered. So when you 'write' some JSON... it can be any order.
The way to avoid this is to apply a sort function to your JSON.
You can do this by using sort_by to explicitly sort:
#!/usr/bin/env perl
use strict;
use warnings;
use XML::Twig;
use JSON::PP;
use Data::Dumper;
sub order_nodes {
my %rank_of = ( ip => 0, route => 1, address => 2, network => 3, netmask => 4, gateway => 5 );
print "$JSON::PP::a <=> $JSON::PP::b\n";
return $rank_of{$JSON::PP::a} <=> $rank_of{$JSON::PP::b};
}
my $twig = XML::Twig -> parse (\*DATA);
my $json = JSON::PP -> new;
$json ->sort_by ( \&order_nodes );
print $json -> encode( $twig -> simplify );
__DATA__
<Config>
<ip>
<address>1.1.1.1</address>
<netmask>255.255.255.0</netmask>
</ip>
<route>
<network>20.20.20.0</network>
<netmask>55.255.255.0</netmask>
<gateway>1.1.1.1</gateway>
</route>
</Config>
In some scenarios, setting canonical can help, as that sets ordering to lexical order. (And means your JSON output would be consistently ordered). This doesn't apply to your case.
You could build the node ordering via XML::Twig, either by an xpath expression, or by using twig_handlers. I gave it a quick go, but got slightly unstuck in figuring out how you'd 'tell' how to figure out ordering based on getting address/netmask and then network/netmask/gateway.
As a simple example you could:
my $count = 0;
foreach my $node ( $twig -> get_xpath ( './*' ) ) {
$rank_of{$node->tag} = $count++ unless $rank_of{$node->tag};
}
print Dumper \%rank_of;
This will ensure ip and route are always the right way around. However it doesn't order the subkeys.
That actually gets a bit more complicated, as you'd need to recurse... and then decide how to handle 'collisions' (like netmask - address comes before, but how does it sort compared to network).
Or alternatively:
my $count = 0;
foreach my $node ( $twig->get_xpath('.//*') ) {
$rank_of{ $node->tag } = $count++ unless $rank_of{ $node->tag };
}
This walks all the nodes, and puts them in order. It doesn't quite work, because netmask appears in both stanzas though.
You get:
{"ip":{"address":"1.1.1.1","netmask":"255.255.255.0"},"route":{"netmask":"55.255.255.0","network":"20.20.20.0","gateway":"1.1.1.1"}}
I couldn't figure out a neat way of collapsing both lists.
New to Perl and I am digging what I can do as well as the support and documentation available for all these great libraries; however, I am having an issue with a script I am working on. Prior to implementing HTML::TagFilter, I was using line 63 (print FH $tree->as_HTML) to print to file the html content I was looking for. I looked specifically for everything in the body tag. Now I'd like to only print out the p tags, h tags, and img tags without any attributes. When I run my code, the files are created in the proper directory but in each file a hash object is printed (HTML::Element=HASH(0x3a104c8)).
use open qw(:locale);
use strict;
use warnings qw(all);
use HTML::TreeBuilder 5 -weak; # Ensure weak references in use
use URI::Split qw/ uri_split uri_join /;
use HTML::TagFilter;
my #links;
open(FH, "<", "index/site-index.txt")
or die "Failed to open file: $!\n";
while(<FH>) {
chomp;
push #links, $_;
}
close FH;
my $dir = "";
while($dir eq ""){
print "What is the name of the site we are working on? ";
$dir = <STDIN>;
chomp $dir;
}
#make directory to store files
mkdir($dir);
my $entities = "";
my $indent_char = "\t";
my $filter = HTML::TagFilter->new(
allow=>{ p => { none => [] }, h1 => { none => [] }, h2 => { none => [] }, h3 => { none => [] }, h4 => { none => [] }, h5 => { none => [] }, h6 => { none => [] }, img => { none => [] }, },
log_rejects => 1,
strip_comments => 1
);
foreach my $url (#links){
#print $url;
my ($filename) = $url =~ m#([^/]+)$#;
#print $filename;
$filename =~ tr/=/_/;
$filename =~ tr/?/_/;
#print "\n";
my $currentfile = $dir . '/' . $filename . '.html';
print "Preparing " . $currentfile . "\n" . "\n";
open (FH, '>', $currentfile)
or die "Failed to open file: $!\n";
my $tree = HTML::TreeBuilder->new_from_url($url);
$tree->parse($url);
$tree = $tree->look_down('_tag', 'body');
if($tree){
$tree->dump; # a method we inherit from HTML::Element
print FH $filter->filter($tree);
#print FH $tree->as_HTML($entities, $indent_char), "\n";
} else{
warn "No body tag found";
}
print "File " . $currentfile . " completed.\n" . "\n";
close FH;
}
Why is this happening and how can I print the actual content I am looking for?
Thank you.
$filter->filter() expects HTML, HTML::TreeBuilder is not HTML, but a subclass of HTML::Element. look_down() returns a HTML::Element. That is what you see from your print, because when you treat this reference as a string, you will get the string representation of the object. HTML::Element=HASH(0x7f81509ab6d8), which means that the object HTML::Element, which is solved by a HASH structure and the memory address of this object.
You can fix it all by calling filter with the HTML from the look_down:
print FH $filter->filter($tree->as_HTML);
I'm usually no Perl coder. However I've got to complete this task.
The following code works for me:
#!/usr/bin/perl
use LWP::UserAgent;
use JSON;
use strict;
my $md5 = $ARGV[0];
$md5 =~ s/[^A-Fa-f0-9 ]*//g;
die "invalid MD5" unless ( length($md5) == 32 );
my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 1 }, timeout => 10);
my $key="12345...7890";
my $url='https://www.virustotal.com/vtapi/v2/file/report';
my $response = $ua->post( $url, ['apikey' => $key, 'resource' => $md5] );
die "$url error: ", $response->status_line unless $response->is_success;
my $results=$response->content;
my $json = JSON->new->allow_nonref;
my $decjson = $json->decode( $results);
print "md5: ",$md5,"\n";
print "positives: ", $decjson->{"positives"}, "\n";
print "total: ", $decjson->{"total"}, "\n";
print "date: ", $decjson->{"scan_date"}, "\n";
Now I would like to recode the above for using asynchronous http using Mojo. I'm trying this:
#!/usr/bin/perl
use warnings;
use strict;
use Mojo;
use Mojo::UserAgent;
my $md5 = $ARGV[0];
$md5 =~ s/[^A-Fa-f0-9 ]*//g;
die "invalid MD5" unless ( length($md5) == 32 );
my ($vt_positives, $vt_scandate, $response_vt);
my $url='https://www.virustotal.com/vtapi/v2/file/report';
my $key="12345...7890";
my $ua = Mojo::UserAgent->new;
my $delay = Mojo::IOLoop->delay;
$ua->max_redirects(0)->connect_timeout(3)->request_timeout(6);
$ua->max_redirects(5);
$delay->begin;
$response_vt = $ua->post( $url => ['apikey' => $key, 'resource' => $md5] => sub {
my ($ua, $tx) = #_;
$vt_positives=$tx->res->json->{"positives"};
print "Got response: $vt_positives\n";
});
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
The first code is OK, the second isn't working. I must be doing something wrong when sending the request since I seem to get a 403 response (incorrect API usage). I also tried -> json calls but it didn't work out.
And even if I had done the request correctly, I'm not sure if I'm correctly decoding the json results with Mojo.
Help will be appreciated!
EDIT
It seems that we missed the real question, how to post forms. Oops sorry about that.
Posting forms depends on which version of Mojolicious you are using. Until recently (v3.85 -- 2013-02-13) there was a post_form method. On reflection however, it was decided there should either be *_form methods for every request type, or we should do something smarter, and thus the form generator was born.
$response_vt = $ua->post(
$url,
form => {'apikey' => $key, 'resource' => $md5},
sub { ... }
);
It can be added to any request method, making it much more consistent than the old form. Also note that it should be a hashref, not an arrayref as LWP allows. BTW there is also a json generator that works like this too, or you can even add your own!
I'm leaving my original answer, showing non-blocking usage, which you may now amend given the above.
ORIGINAL
Building off the logic from creaktive, this is how I would start. The major difference is that there isn't a monitor watching to be sure that there are works going, rather when one finishes it checks to be sure that there are no idlers.
I have also made some changes in the parsing logic, but nothing major.
#!/usr/bin/env perl
use Mojo::Base -strict;
use utf8::all;
use Mojo::URL;
use Mojo::UserAgent;
# FIFO queue
my #urls = qw(
http://sysd.org/page/1/
http://sysd.org/page/2/
http://sysd.org/page/3/
);
# User agent following up to 5 redirects
my $ua = Mojo::UserAgent
->new(max_redirects => 5)
->detect_proxy;
start_urls($ua, \#urls, \&get_callback);
sub start_urls {
my ($ua, $queue, $cb) = #_;
# Limit parallel connections to 4
state $idle = 4;
state $delay = Mojo::IOLoop->delay(sub{say #$queue ? "Loop ended before queue depleated" : "Finished"});
while ( $idle and my $url = shift #$queue ) {
$idle--;
print "Starting $url, $idle idle\n\n";
$delay->begin;
$ua->get($url => sub{
$idle++;
print "Got $url, $idle idle\n\n";
$cb->(#_, $queue);
# refresh worker pool
start_urls($ua, $queue, $cb);
$delay->end;
});
}
# Start event loop if necessary
$delay->wait unless $delay->ioloop->is_running;
}
sub get_callback {
my ($ua, $tx, $queue) = #_;
# Parse only OK HTML responses
return unless
$tx->res->is_status_class(200)
and $tx->res->headers->content_type =~ m{^text/html\b}ix;
# Request URL
my $url = $tx->req->url;
say "Processing $url";
parse_html($url, $tx, $queue);
}
sub parse_html {
my ($url, $tx, $queue) = #_;
state %visited;
my $dom = $tx->res->dom;
say $dom->at('html title')->text;
# Extract and enqueue URLs
$dom->find('a[href]')->each(sub{
# Validate href attribute
my $link = Mojo::URL->new($_->{href});
return unless eval { $link->isa('Mojo::URL') };
# "normalize" link
$link = $link->to_abs($url)->fragment(undef);
return unless grep { $link->protocol eq $_ } qw(http https);
# Don't go deeper than /a/b/c
return if #{$link->path->parts} > 3;
# Access every link only once
return if $visited{$link->to_string}++;
# Don't visit other hosts
return if $link->host ne $url->host;
push #$queue, $link;
say " -> $link";
});
say '';
return;
}
Take a look at this concurrent-requesting Mojolicious-based web crawler I wrote to illustrate my article Web Scraping with Modern Perl:
#!/usr/bin/env perl
use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);
use Mojo::UserAgent;
# FIFO queue
my #urls = map { Mojo::URL->new($_) } qw(
http://sysd.org/page/1/
http://sysd.org/page/2/
http://sysd.org/page/3/
);
# Limit parallel connections to 4
my $max_conn = 4;
# User agent following up to 5 redirects
my $ua = Mojo::UserAgent
->new(max_redirects => 5)
->detect_proxy;
# Keep track of active connections
my $active = 0;
Mojo::IOLoop->recurring(
0 => sub {
for ($active + 1 .. $max_conn) {
# Dequeue or halt if there are no active crawlers anymore
return ($active or Mojo::IOLoop->stop)
unless my $url = shift #urls;
# Fetch non-blocking just by adding
# a callback and marking as active
++$active;
$ua->get($url => \&get_callback);
}
}
);
# Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
sub get_callback {
my (undef, $tx) = #_;
# Deactivate
--$active;
# Parse only OK HTML responses
return
if not $tx->res->is_status_class(200)
or $tx->res->headers->content_type !~ m{^text/html\b}ix;
# Request URL
my $url = $tx->req->url;
say $url;
parse_html($url, $tx);
return;
}
sub parse_html {
my ($url, $tx) = #_;
say $tx->res->dom->at('html title')->text;
# Extract and enqueue URLs
for my $e ($tx->res->dom('a[href]')->each) {
# Validate href attribute
my $link = Mojo::URL->new($e->{href});
next if 'Mojo::URL' ne ref $link;
# "normalize" link
$link = $link->to_abs($tx->req->url)->fragment(undef);
next unless grep { $link->protocol eq $_ } qw(http https);
# Don't go deeper than /a/b/c
next if #{$link->path->parts} > 3;
# Access every link only once
state $uniq = {};
++$uniq->{$url->to_string};
next if ++$uniq->{$link->to_string} > 1;
# Don't visit other hosts
next if $link->host ne $url->host;
push #urls, $link;
say " -> $link";
}
say '';
return;
}
LWP::UserAgent takes arguments to post as either ref to array or ref to hash format.
http://search.cpan.org/~gaas/libwww-perl-6.04/lib/LWP/UserAgent.pm#REQUEST_METHODS
$ua->post( $url, \%form )
$ua->post( $url, \#form )
which you provide in the first script in the ref to array format "\#form"
my $response = $ua->post( $url, ['apikey' => $key, 'resource' => $md5] );
being as it is a hash this is probably better written in the hash format "\%form"
my $response = $ua->post( $url, {'apikey' => $key, 'resource' => $md5} );
In Mojo::UserAgent the arguments to post are a little more complex, but essentially appear to be a "string" of hash refs to hash keys, with which I am unfamiliar. However you may find using the hash ref format provides the expected arguments correctly.
http://search.cpan.org/~sri/Mojolicious-3.87/lib/Mojo/UserAgent.pm#post
POST
my $tx = $ua->post('kraih.com');
my $tx = $ua->post('http://kraih.com' => {DNT => 1} => 'Hi!');
my $tx = $ua->post('http://kraih.com' => {DNT => 1} => form => {a => 'b'});
my $tx = $ua->post('http://kraih.com' => {DNT => 1} => json => {a => 'b'});
try this ?:
$response_vt = $ua->post( $url => form => {'apikey' => $key, 'resource' => $md5} => sub {... });