#!/usr/bin/perl
#
# Script to extract housing information from the Woningnet website.
#
# Requirements: perl5 with HTML::TableExtract, LWP::Simple,
# DBI, DBD::XBase, DBIx::Recordset.
#
# The idea is that you first save all the list-pages with links to info about
# individual houses on your local disk (using your browser), then
# run this script with filenames as arguments (i.e. 'leech-woningnet *.html')
#
# This script will automatically get the individual info-pages and stick all
# the info in a table.
# I chose to use DBI (what else?) to create that table, but since I wanted
# to keep it simple and not use a real RDBMS, I chose the DBase driver.
# You can load the created DBase files in your favourite spreadsheet
# application and mess around with them.
#
# Note there's some bug in this script which I don't know how to get out:
# After I've created a table (i.e. DBASE file), I can't insert the data in the
# table anymore. So I just dump the gathered data to a file and tell you that
# you need to run the script again to insert the data in the already-created
# table... (You'll see it when you run the script.)
#
# (You could adjust this script to get all the list-pages on your computer
# automatically using HTTP GET, but then you cannot login and filter out
# the nice ones in the initial search page. I tried logging in using HTTP POST,
# but that didn't work. See bottom for that non-working code.)
#
use strict;
use warnings;

use HTML::TableExtract;
use LWP::Simple;
#use XML::Simple;
use DBIx::Recordset;
use Data::Dumper;

##
## 'configuration':
## (no need to adjust anything here - you are just supposed to create a dir)
##
my $Database       = 'woningen';            # in the DBASE case, this 'database' is a directory
my $Table          = 'houses';
my $Dumpfile       = 'woningen/leech.dmp';
my $DBI_Datasource = 'DBI:XBase:' . $Database;

if ($DBI_Datasource =~ /XBase/) {           # test for dbase
    die "directory '$Database' does not exist!\n" unless -d $Database;
}

my @houses_data;    # array of hashrefs, one per house
my @FIELDORDER;     # field order of the first detail page; used for 'create table'

### Get all information on houses into an array of hashes == houses.
# Don't do any real command line processing, just check for --readdump.
if (defined $ARGV[0] and $ARGV[0] eq '--readdump') {
    # Re-read a Data::Dumper dump written by an earlier run.  The dump is
    # produced by this script itself, so string-eval'ing it is acceptable
    # here -- but never point $Dumpfile at untrusted data.
    my $VAR1;
    my $code = slurp_file($Dumpfile);
    eval $code;
    die "could not parse $Dumpfile: $@\n" if $@;
    @houses_data = @$VAR1;
}
else {
    ### get all list pages named on the command line
    for my $file (@ARGV) {
        print STDERR "* Doing $file\n";
        my $html_string = slurp_file($file);

        # keep_html => 1 so the <a href=...> links survive table extraction
        my $te = new HTML::TableExtract(headers => ['adres'], keep_html => 1);
        $te->parse($html_string);

        foreach my $ts ($te->table_states) {
            foreach my $row ($ts->rows) {
                my ($url) = $row->[0] =~ /a\s+href=\'?\"?(.*?)\'?\"?\>/i;
                next unless defined $url;    # row without a link

                my $data = process_url($url);
                next unless defined $data;   # page could not be retrieved

                $data = xbase_manglehash($data);
                push @houses_data, $data;
                # end processing one row in index == one html page
            }
        }   # end processing all rows in all matching tables in index
    }
}

### now do processing of array of hashes
# print XMLout( { 'data' => \@houses_data} );

my $db = DBIx::Database->new({ '!DataSource' => $DBI_Datasource, '!KeepOpen' => 1 })
    or die;

my $val = eval {
    *rs = DBIx::Recordset->Setup({ '!DataSource' => $db, '!Table' => $Table });
};
if (!defined($val)) {
    # NOTE(review): this dies when the error *matches* "table ... not found"
    # and tries to create the table otherwise, which looks inverted -- but it
    # depends on DBIx::Recordset's exact error strings; confirm before changing.
    die $@ . "\n" if $@ =~ /table $Table not found/;

    warn "Datasource not found - trying to create table\n";
    DBIx::Recordset::Undef('rs');

    my $statement = xbase_create_statement(\@houses_data);
    $db->do($statement);
    *rs = DBIx::Recordset->Setup({ '!DataSource' => $db, '!Table' => $Table });

    warn "\n\nSorry -- there's a bug in this program: it does not insert the data into the DBF after it is first created! Run '$0 --readdump' now to insert the data!!!\n";

    open my $dump_fh, '>', $Dumpfile
        or die "open dump for write: $!\n";
    print {$dump_fh} Dumper(\@houses_data);
    close $dump_fh or die "close dump: $!\n";
}

foreach my $data (@houses_data) {
    $main::rs->Insert($data);
}

exit 0;

######

# Slurp a whole file into one string.
# (Replaces the old `cat $file` backticks, which spawned a shell and broke
# on filenames containing shell metacharacters.)
sub slurp_file {
    my ($path) = @_;
    open my $fh, '<', $path or die "open $path: $!\n";
    my $content = do { local $/; <$fh> };
    close $fh;
    return $content;
}

# Adjust data in a hash to be suitable for an XBASE file:
# field names at most 10 chars, all upper case.
sub xbase_manglehash {
    my ($hashref_in) = @_;
    my %hash_out;
    while (my ($field, $data) = each %$hashref_in) {
        $hash_out{ xbase_manglefield($field) } = $data;
    }
    return \%hash_out;
}

# Mangle a single field name to XBASE rules (upper case, max 10 chars).
sub xbase_manglefield {
    my ($field) = @_;
    $field = uc($field);
    $field = substr($field, 0, 10) if length($field) > 10;
    return $field;
}

# Build the 'create table' statement.
# (Fields will all be char() - and have the minimum length to hold all info.)
sub xbase_create_statement {
    my ($arrayref_in) = @_;

    # Per field, the longest value seen in any dataset.
    my %maxlength;
    foreach my $dataset (@$arrayref_in) {
        while (my ($field, $data) = each %$dataset) {
            next if $field =~ /^\!/;    # do not process special fields (starting with !)
            next unless defined $data;  # undef values contribute no length
            if (!defined $maxlength{$field} or $maxlength{$field} < length $data) {
                $maxlength{$field} = length $data;
            }
        }
    }

    my $fieldspec = '';
    # We want to do the fields in the 'create table' statement in the right
    # order, but 'each' messes the order up -- so first process the hash
    # fields recorded in @FIELDORDER, then whatever is left over.
    while (my $field = shift @FIELDORDER) {
        $field = xbase_manglefield($field);
        if (exists $maxlength{$field}) {
            $fieldspec .= " $field char($maxlength{$field}),";
            delete $maxlength{$field};
        }
    }
    while (my ($field, $len) = each %maxlength) {
        $fieldspec .= " $field char($len),";
    }
    # Turn the trailing comma into a space (guard against empty input).
    substr($fieldspec, -1) = ' ' if length $fieldspec;

    return "create table $Table ($fieldspec);";
}

####

# Fetch one house-detail page and return a hashref of field => value.
# Returns nothing (undef in scalar context) when the page cannot be fetched,
# so callers must check the result before dereferencing it.
sub process_url {
    my ($url) = @_;
    my $empty_fields = 0;
    my %data;

    # Remember field order by sticking fieldnames in an array
    # (this may be necessary for the 'create table' statement).
    # Do this only the first time, i.e. while @FIELDORDER is still empty.
    # BUGFIX: the original read 'my $x = $#FIELDORDER = -1', an assignment
    # that emptied @FIELDORDER on every call; '==' is what was intended.
    my $determine_fieldorder = ($#FIELDORDER == -1);

    $url =~ s/\.\.\/\.\./http:\/\/roa.woningnet.nl/;
    print STDERR "Retrieving $url...\n";

    my $content = get($url);
    if (!defined $content) {
        print STDERR "Could not get $url!\n";
        return;
    }

    # Take all tables with depth 2. (At this moment, the first is empty,
    # the second ('header' in HTML page) contains 'stadsdeelraad'
    # and the third contains all other data.)
    my $te = new HTML::TableExtract(depth => 2);
    $te->parse($content);

    foreach my $ts ($te->table_states) {
        foreach my $row ($ts->rows) {
            my $field   = shift @$row;
            my $content = join("\n", @$row);
            next if ($field =~ /^\s*$/ and $content =~ /^\s*$/);

            # do content
            $content = manglecontent($content);
            if ($content eq '') {
                next if ($field eq 'Voorwaarden'
                      or $field eq 'Kenmerken'
                      or $field eq 'Reageren');
                $content = manglecontent($field);
                $field   = '';
            }

            # do field
            $field =~ s/\xA0/ /g;    # &nbsp;
            $field =~ s/\s//g;
            $field =~ s/\://g;
            if ($field eq '') {
                $field = 'Field' . ++$empty_fields;
            }
            # disambiguate duplicate field names with trailing underscores
            while (exists($data{$field})) {
                $field .= '_';
            }

            # on very first call to this routine, determine order of fields
            push @FIELDORDER, $field if $determine_fieldorder;
            $data{$field} = $content;
        }
    }

    # 'Verhuurder' field for some reason has 2 different kinds of data;
    # split it, but only when it is present and actually two-lined, so we
    # never clobber the field with undef.
    if (defined $data{'Verhuurder'}) {
        my ($verhuurder, $soort) = $data{'Verhuurder'} =~ /^(.*?)\n(.*)$/;
        if (defined $verhuurder) {
            $data{'Verhuurder'} = $verhuurder;
            $data{'Soort'}      = $soort;
        }
    }

    return \%data;
}

# Clean up one cell's text: decode some special characters and
# trim whitespace at the edges and around embedded newlines.
sub manglecontent {
    my ($text) = @_;
    $text =~ s/\xA0/ /g;      # &nbsp;
    $text =~ s/\xB2/2/g;      # superscript two (square metres)
    $text =~ s/\x83/Fl./g;    # florin sign -> guilders
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+\n/\n/g;
    $text =~ s/\n\s+/\n/g;
    return $text;
}

__END__

# just for debugging
sub dotabledata_print {
    my ($te) = @_;
    # Examine all matching tables
    foreach my $ts ($te->table_states) {
        print STDERR "Table (", join(',', $ts->coords), "):\n";
        foreach my $row ($ts->rows) {
            print join(',', @$row), "\n";
        }
    }
}

##################################################
# Below code does not work. I tried to get the index pages using POST method,
# with the idea of letting the program do the logging in, but it fails. I get
# the wrong data back and I don't know why. If anyone wants to try fixing this,
# be my guest and let me know...

use strict;
use HTTP::Request::Common;
require LWP::UserAgent;

#################################### Change these values
my $REGNR    = '039203456';
my $POSTCODE = '1000 AB';
my $PASSWORD = 'Bla04567';
####################################

# There's several slightly different ways to do POST requests,
# but this seems to be the most 'common/simple' one...
# (using HTTP::Request::Common)
my $ua = LWP::UserAgent->new;

#my $res = $ua->request(GET 'http://roa.woningnet.nl/reacties/aanmelden.asp',);
#warn "aanmeldpagina niet geladen!\n" unless $res->is_success;
#print $res->content;

print "####################\n";
my $res = $ua->request(GET 'http://roa.woningnet.nl/aanbod/selecteren.asp?pgnr=2');
#my $res = $ua->request(POST 'http://roa.woningnet.nl/reacties/checkreginfo.asp',
#    [ FRegNr => $REGNR, FPostCode => $POSTCODE, FPwd => $PASSWORD ]);
warn "inloggen niet gelukt!\n" unless $res->is_success;
print $res->content;