#!/usr/bin/perl # make perl reasonably restrictive use strict; use warnings; # parsers use HTML::Form; use HTML::TreeBuilder; use CGI (qw/escape/); # modules for outbound requests require HTTP::Request; use LWP::UserAgent; ########################################################## # start from there my $base_uri="http://www.gem.or.at/cgi-bin/gem.pl?aktion=suche&" . "funktion=maske&sprache=de"; # for requests my ($ua, $request, $response); my (@forms, $form, %sc_urls); ########################################################## # make a request to collect the dropdown options $request = HTTP::Request->new(GET => "$base_uri"); $ua = LWP::UserAgent->new; $response = $ua->request($request); # for testing: local version #$response = cat("gem.html"); @forms = HTML::Form->parse($response, $base_uri); # subroutine (see below) for extraction my %out=listoptions($forms[0],"stichwort1"); ########################################################## # q'n'd rss header ... print <<"EOF"; EOF my $escaped_uri=escape($base_uri); for my $stichwort (keys %out) { if ($stichwort) { print <<"EOF"; $out{$stichwort} $escaped_uri EOF print STDERR ">>>>>>>> $stichwort\n"; $forms[0]->param('stichwort1',$stichwort); $ua = LWP::UserAgent->new; $forms[0]->action =~ m#(.*/)#; my $base_uri=$1; # caching my $content; if (-f "gem-db-$stichwort.html") { $content=cat("gem-db-$stichwort.html"); } else { $response = $ua->request($forms[0]->click); open O,">gem-db-$stichwort.html"; print O $response->content; close O; $content=$response->content; } my $tree=HTML::TreeBuilder->new_from_content($content); for ([$tree->find('table')]->[3]->find('td')) { my $link=$base_uri.[$_->extract_links]->[0][0][0]; if ($link =~ /gem.pl/) { #my $content=join(" ",keys %$_); # _parent _content align valign _tag my $content=${$$_{_content}[0]}{"_content"}[0]; #>>>>>>>> 51 #Use of uninitialized value in concatenation (.) or string at # ./gender-rdf.pl line 61. #Use of uninitialized value in concatenation (.) or string at # ./gender-rdf.pl line 61. #Can't use string ("Es wurde kein Eintrag in der Dat") as a HASH ref # while "strict refs" in use at ./gender-rdf.pl line 64. if ($content && $link) { $escaped_uri=escape($link); print <<"EOF"; $content $escaped_uri EOF } } #print [$_->find("td")]->[0]->attr("text"); #print " > ",$_->find("a")->attr("text"),"\n"; } print <<"EOF"; EOF print STDERR "<<<<<<<< $stichwort\n"; } } ########################################################## # q'n'd rss footer ... print <<"EOF"; EOF sub cat { my ($name, $oldifd, $data); ($name)=@_; $oldifd=$/; $/=undef; open I,$name; $data=; close I; $/=$oldifd; return $data; } sub listoptions ($$) { my ($form,$name)=@_; my %liste; my @values=$form->find_input($name)->possible_values; my @names=$form->find_input($name)->value_names; for my $i (0..$#names) { $liste{$values[$i]}=$names[$i]; } return %liste; }