use Encode;
use Modern::Perl;
use File::Slurp;
use Mojo::UserAgent;
use File::Basename qw/basename/;
use File::Path qw/mkpath/;
STDOUT->autoflush(1);

# Crawler state shared with the subs below through package globals:
#   $ua   - the HTTP client used for every request
#   $main - site root that page and article paths are appended to
#   $wdir - local directory the downloaded articles are written into
our $ua   = Mojo::UserAgent->new();
our $main = "http://www.court.gov.cn";
our $wdir = "F:/temp/gov_wenshu";
mkpath $wdir unless -e $wdir;

# Get the listing-URL prefix and the number of the last listing page.
my ($prefix, $maxpg) = get_max_pgcode( $main . "/wenshu.html" );

# Without both values there is nothing to iterate over; stop loudly instead of
# looping over "1 .. undef" and exiting as if the crawl had succeeded.
die "Cannot determine the last listing page; aborting\n"
    unless defined $prefix && defined $maxpg;

for my $id ( 1 .. $maxpg ) {
    my $page_url = "${main}${prefix}$id.html";

    # print, not printf: the URL is data and must not be parsed as a format.
    print "$page_url\n";
    get_article($page_url);
}
-
# Download every article linked from one listing page into $wdir.
#
# Parameters:
#   $link - URL of a listing page.
#
# Every anchor matched by ".list .l li a" is fetched from $main . href and
# stored as "$wdir/<basename of href>". The href is appended to $main as-is,
# so it is assumed to be a site-relative path -- TODO confirm against the site.
# Files that already exist are not fetched again. Failures are reported on
# STDERR and skipped so a single bad page does not end the crawl.
#
# Returns nothing.
sub get_article
{
    our ($ua, $main, $wdir);
    my ( $link ) = @_;

    # result() throws on connection errors; trap it so the caller can move on
    # to the next listing page.
    my $dom = eval { $ua->get( $link )->result->dom };
    if ( !$dom ) {
        warn "Failed to fetch listing $link: $@\n";
        return;
    }

    for my $e ( $dom->find(".list .l li a")->each )
    {
        my $href = $e->attr("href");
        next unless defined $href && length $href;    # anchor without a target

        my $name = basename($href);
        printf "%s\n", $name;

        my $fpath = $wdir ."/". $name;
        next if ( -e $fpath );

        my $res = eval { $ua->get( $main . $href )->result };

        # Only store successful responses: a saved error page would satisfy the
        # -e check above on every later run and never be retried.
        if ( !$res || !$res->is_success ) {
            my $why = $res ? "HTTP " . $res->code : $@;
            warn "Failed to fetch $main$href: $why\n";
            next;
        }

        # Write the body bytes untouched (no newline translation on Windows).
        write_file( $fpath, { binmode => ':raw' }, $res->body );
    }

    return;
}
-
# Read the pager's "last page" link and split its href into prefix and number.
#
# Parameters:
#   $link - URL of a page containing the pager (".yiiPager .last a").
#
# Returns:
#   ($prefix, $last_page) when the href looks like "<prefix>/<digits>.html",
#   where $prefix keeps its trailing slash; an empty list otherwise (after a
#   message on STDERR), so "my ($p, $n) = get_max_pgcode(...)" leaves both
#   variables undefined on failure.
#
# Connection errors raised by the user agent are not caught here.
sub get_max_pgcode
{
    our $ua;
    my ( $link ) = @_;

    my $res = $ua->get( $link )->result;

    # at() yields undef when the selector matches nothing; do not call attr()
    # on it blindly.
    my $last = $res->dom->at(".yiiPager .last a");
    my $href = $last ? $last->attr("href") : undef;

    if ( defined $href && $href =~ m{^(.*/)(\d+)\.html} ) {
        return ($1, $2);
    }

    warn "Failed to get max page code\n";
    return;    # bare return: empty list in list context, undef in scalar
}
# End of script.