[新手上路]批处理新手入门导读[视频教程]批处理基础视频教程[视频教程]VBS基础视频教程[批处理精品]批处理版照片整理器
[批处理精品]纯批处理备份&还原驱动[批处理精品]CMD命令50条不能说的秘密[在线下载]第三方命令行工具[在线帮助]VBScript / JScript 在线参考
返回列表 发帖

[其他] 使用第三方命令爬取最高人民法院裁判文书

本帖最后由 pcl_test 于 2019-2-8 11:01 编辑

感谢贴
十分感谢本论坛。由于工作需要,春节假期用批处理和论坛提供的sed、iconv工具编写了一款爬取最高法裁判文书的爬虫脚本,总共1万多份裁判文书都可以在本地进行搜索查询了。虽然只是静态网页爬虫,但十分有成就感,今后争取能学会动态网页爬虫技术,哈哈。下面上代码:
  1. set url_1=http://www.court.gov.cn/wenshu.html
  2. echo # target = %url_1%
  3. curl %url_1%>temp.tmp 2>nul
  4. iconv -c -f utf-8 -t gbk//ignore temp.tmp >gbk.tmp
  5. sed -r -i "s/[[:space:]]//g" gbk.tmp
  6. sed -n -r "/<liclass=\"last\">.*page/p" gbk.tmp >temp.tmp
  7. sed -r -i "s/.*page.([0-9]+)(\.html)?\".*/\1/" temp.tmp
  8. set /p pages=<temp.tmp
  9. sed -n -r "/共收录<font>([0-9]+)<\/font>份/p" gbk.tmp >temp.tmp
  10. sed -r -i "s/.*共收录<font>([0-9]+)<\/font>份.*/\1/" temp.tmp
  11. set /p articles=<temp.tmp
  12. echo # find !pages! pages , !articles! articles
  13. set /p down_number=# input the number of latest articles to down ^( 1 - !articles! ^) :
  14. set n=1
  15. :loop_begin
  16. if !n! GTR !pages! goto loop_end
  17. curl %url_1%?page=!n!>temp.tmp 2>nul
  18. iconv -c -f utf-8 -t gbk//ignore temp.tmp >gbk.tmp
  19. sed -r -i "s/[[:space:]]//g" gbk.tmp
  20. sed -n -r -i "/target=\"_blank\"href=\".*html/p" gbk.tmp
  21. sed -r -i "s/.*target=\"_blank\"href=\"(.*html).*/\1/" gbk.tmp
  22. type gbk.tmp>>link.bak
  23. set /a n+=1
  24. set lines=0
  25. for /f %%i in (link.bak) do set /a lines+=1
  26. if !lines! GEQ !down_number! (
  27.     goto loop_end
  28. ) else (
  29.     goto loop_begin
  30. )
  31. :loop_end
  32. set start_time=!time!
  33. set n=0
  34. :down_begin
  35. set /a down_number-=1
  36. if !down_number! LSS 0 goto end
  37. set url=""
  38. set /p url=<link.bak
  39. if !url!=="" goto end
  40. curl http://www.court.gov.cn!url! >temp.tmp 2>nul
  41. iconv -c -f utf-8 -t gbk//ignore temp.tmp >gbk.tmp
  42. sed -r -i "s/[[:space:]]//g" gbk.tmp
  43. set t=""
  44. sed -n "/<divclass=\"title\">/p" gbk.tmp>title.tmp
  45. sed -r -i "s/<[^>]*>//g" title.tmp
  46. sed -i "s/:/:/g" title.tmp
  47. sed -i "s/(/(/g" title.tmp
  48. sed -i "s/)/)/g" title.tmp
  49. set /p t=<title.tmp
  50. sed -n -i "/<divclass=\"txt_txt\"id=\"zoom\">/,/\[CDATA\[/p" gbk.tmp
  51. sed -i "s/&nbsp;//g" gbk.tmp
  52. sed -r -i "s/<[^>]*>/\n/g" gbk.tmp
  53. sed -n -i "1,/^二〇.*年.*月.*日/p" gbk.tmp
  54. sed -r -i "/^$/d" gbk.tmp
  55. sed -i "s/(/(/g" gbk.tmp
  56. sed -i "s/)/)/g" gbk.tmp
  57. sed -n -r "/^([0-9][0-9][0-9][0-9]).*号$/p" gbk.tmp >num.tmp
  58. set file_number=""
  59. set /p file_number=<num.tmp
  60. sed -r -i "s/^(.*)/    \1/" gbk.tmp
  61. ren gbk.tmp "!file_number! !t!".txt 2>nul
  62. set /a n+=1
  63. sed -i "1 d" link.bak
  64. echo # !n! articles down
  65. goto down_begin
  66. :end
  67. del link.bak
  68. del *.tmp
  69. echo # mission start at !start_time! end at !time! , !n! succeed
  70. pause>nul
复制代码
2

评分人数

民工一枚

无聊 来个娱乐版的。在线式的。不下载到硬盘。内存存取。可以返回。不特别优化,纯属添加功能。跟WIN10学的。这些东西不想下载到硬盘。
:按两次<Enter>显示选定项的内容;按<Enter>+<BackSpace>返回上一层。不带窗口按钮的。
# Interactive, in-memory browser for the court.gov.cn judgement-paper list.
# Nothing is saved to disk: index pages and documents are fetched on demand
# and rendered in the console.  Press <Enter> twice to show the selected
# item; press <Enter>+<BackSpace> to go back up one level.
# NOTE(review): every -Match pattern below is tied to the site's HTML layout
# at posting time (2019) - confirm against the live markup before reuse.
  1. # Configuration
  2. $title = 'class="title|fl print';
  3. $Web = New-Object System.Net.Webclient;
  4. $Web.Encoding = [Text.Encoding]::UTF8;
  5. $url = 'http://www.court.gov.cn/paper/default/index.html';
  6. $Master = 'http://www.court.gov.cn'
  7. $http = (split-path $url).Replace('\','/');
  8. #
# ANJian_ZenLi: download one document page and print its textual content.
# Collects fragments matched by several HTML patterns (title block, styled
# divs, indented closing divs, inline styled p/div), strips the tags,
# decodes a few HTML entities, and emits the accumulated lines.
  9. Function ANJian_ZenLi {
  10. Param ( [string]$url )
  11. [Collections.Arraylist] $arr = @();
  12. [Collections.Arraylist] $array = @();
  13. $Content = $null;
  14. $Page = $Web.DownloadString( $url ) -Split "`n";
  15. $array = $Page -Match $title | %{ $_.Split('"')[-1] -Replace '(</?(li|div))?>','' };
  16. $array.Add('') | Out-Null;
# '@' is injected as a marker for lines split out of style attributes,
# then used to select (and afterwards strip from) the wanted fragments.
  17. $Content = $Page -Match '</div><div style' -Replace ";'>","`r`n@" -Replace '</div>',"`r`n";
  18. $arr = ($Content -Split "`n" -Match '^@') -NotMatch '^@<' -Replace '^@','' `
  19. -Replace '&times;','X' -Replace '&ldquo;','“' -Replace '&rdquo;','”' -Replace '&hellip;','…';
  20. $arr = $arr -notmatch '^$'; if ( $arr ) { $arr.Add('') | Out-Null };
  21. $array = $array + $arr;
  22. $Content = ( $Page -Match '^\s+.*</div>$') -NotMatch '<div';
  23. $arr = $Content -Replace '^\s+|</div>|&nbsp;','' -Replace '&times;','X' `
  24. -Replace '&ldquo;','“' -Replace '&rdquo;','”' -Replace '&hellip;','…';
  25. $arr = $arr -notmatch '^$'; if ( $arr ) { $arr.Add('') | Out-Null };
  26. $array = $array + $arr;
  27. $arr = $Page -Match '<(div|p) style.*</(div|p)>$' | %{ $_.Split('"')[-1] -Replace '(</(p|div))?>|&nbsp;','' };
  28. $arr = $arr -notmatch '^$'; if ( $arr ) { $arr.Add('') | Out-Null };
  29. $array = $array + $arr;
  30. $array.Add('*************************完成*************************') | Out-Null;
  31. '';$array;
  32. };
# InputLine: prompt for an item number on the current page, display the
# chosen document, then re-list the page.  BackSpace returns to the page
# prompt.  NOTE(review): navigation is done by recursive calls (InputLine /
# InputPage call each other), so a very long session keeps growing the call
# stack - by design in this quick script.
  33. Function InputLine {
  34. $num = Read-Host -Prompt "输入内容序号,1 - $n ;按<Enter>+<退格>返回上一层 按2次<Enter>显示内容";
  35. if ( [Console]::ReadKey($true).Key -eq 'BackSpace' ) { cls;InputPage };
  36. $num = $num -as [int32];
  37. if ( !$num -or ($num -lt 1 -or $num -gt $n) ) { CLS;'超出范围,重新输入'; InputLine };
  38. $Url = $XianQingArray[$num-1][0];cls;
  39. ANJian_ZenLi $Url;pause;cls;
  40. Write-Host " 第 $Number 页:如下" -fore Green;
  41. For ( $k =0; $k -lt $XianQingArray.Count; $k++ ) {
  42. '';  Write-Host " <$($XianQingArray[$k][1])> " -Fore red -NoNewLine;
  43. Write-Host $($XianQingArray[$k][2]) -Fore DarkYellow; '';
  44. };
  45. InputLine;
  46. };
# InputPage: prompt for a page number, fetch that index page, build
# $XianQingArray as (link, ordinal, title) triples from the 'xiangqing'
# detail links, list them, then hand off to InputLine.
  47. Function InputPage {
  48. $Specify = Read-Host -Prompt "输入某一个页面,1 - $Last ";
  49. [Collections.Arraylist] $XianQingArray = @();
  50. $Number = $Specify -as [int32];
  51. if ( !$Number  -or ($Number -lt 1 -or $Number -gt $Last) ) { CLS;'超出范围,重新输入'; InputPage };
  52. cls; $Page = $Web.DownloadString( $PageArray[$Specify-1] ) -Split "`n";
  53. $Content = $Page -Match 'xiangqing';
  54. $n = 0;
  55. $Content | %{ $n++
  56. $link = $Master + $_.Split('"')[-2];
  57. $str = $_.Split('"')[-1] -Replace '(</(a|li))?>','';
  58. $XianQingArray += ,($link,$n,$str);
  59. };
  60. cls; Write-Host " 第 $Number 页:如下" -fore Green;
  61. For ( $k =0; $k -lt $XianQingArray.Count; $k++ ) {
  62. '';  Write-Host " <$($XianQingArray[$k][1])> " -Fore red -NoNewLine;
  63. Write-Host $($XianQingArray[$k][2]) -Fore DarkYellow; '';
  64. };
  65. InputLine;
  66. };
# Entry point: fetch the first index page, derive the last page number from
# the "尾页" (last page) link, precompute all page URLs, then start the UI.
  67. $Page = $Web.DownloadString( $url ) -Split "`n";
  68. [int]$Last = ($Page -Match '尾页' -Replace '\D','') -join '';
  69. [Collections.Arraylist] $PageArray = @($url);
  70. (2..$Last) | %{ $PageArray += $http + '/index/page/' + "$_.html" };
  71. InputPage;
复制代码

TOP

  1. use Encode;
  2. use Modern::Perl;
  3. use File::Slurp;
  4. use Mojo::UserAgent;
  5. use File::Basename qw/basename/;
  6. use File::Path qw/mkpath/;
  7. STDOUT->autoflush(1);
  8. our $ua = Mojo::UserAgent->new();
  9. our $main = "http://www.court.gov.cn";
  10. our $wdir = "F:/temp/gov_wenshu";
  11. mkpath $wdir unless -e $wdir;
  12. #获取尾页代码,前缀
  13. my ($prefix, $maxpg) = get_max_pgcode( $main ."/wenshu.html" );
  14. for my $id ( 1 .. $maxpg ) {
  15.     printf "${main}${prefix}$id.html\n";
  16.     get_article( "${main}${prefix}$id.html" );
  17. }
  18. sub get_article
  19. {
  20.     our ($main, $wdir);
  21.     my ( $link ) = @_;
  22.     my $res;
  23.     my $fpath;
  24.     my $dom = $ua->get( $link )->result->dom;
  25.     for my $e ( $dom->find(".list .l li a")->each )
  26.     {
  27.         printf "%s\n", basename($e->attr("href"));
  28.         $fpath = $wdir ."/". basename($e->attr("href"));
  29.         next if ( -e $fpath );
  30.         $res = $ua->get( $main . $e->attr("href") )->result;
  31.         write_file( $fpath, $res->body );
  32.     }
  33. }
  34. sub get_max_pgcode
  35. {
  36.     my ( $link ) = @_;
  37.     my $res = $ua->get( $link )->result;
  38.     my $href = $res->dom->at(".yiiPager .last a")->attr("href");
  39.     if ($href =~/^(.*\/)(\d+)\.html/) { return ($1, $2); }
  40.     else { printf "Failed to get max page code\n"; return undef }
  41. }
复制代码

TOP

回复 2# xczxczxcz

这是什么语言?感觉比较厉害
民工一枚

TOP

这个有点感兴趣。写一个PS的
# Batch variant of the paper browser: walks every index page of the
# court.gov.cn judgement-paper list and appends the extracted text of each
# document to a local log file (案件记录.log), pausing between pages.
# NOTE(review): the SLS (Select-String) patterns are tied to the site's
# HTML layout at posting time - confirm against the live markup before reuse.
  1. # Configuration
  2. $title = 'class="title|fl print';
  3. $Web = New-Object System.Net.Webclient;
  4. $Web.Encoding = [Text.Encoding]::UTF8;
  5. $url = 'http://www.court.gov.cn/paper/default/index.html';
  6. $Master = 'http://www.court.gov.cn'
  7. $http = (split-path $url).Replace('\','/');
  8. $save = '.\案件记录.log';
  9. NI $save -type file -force | Out-Null;
  10. #
# ANJian_ZenLi: download one document page and return its textual content
# as an array of lines.  Collects fragments matched by several HTML
# patterns (title block, styled divs, indented closing divs, inline styled
# p/div), strips the tags and &nbsp;, and appends a "完成" trailer line.
  11. Function ANJian_ZenLi {
  12. Param ( [string]$url )
  13. [Collections.Arraylist] $arr = @();
  14. [Collections.Arraylist] $array = @();
  15. $Content = $null;
  16. $Page = $Web.DownloadString( $url ) -Split "`n";
  17. $Content = ( $Page | SLS -Pattern $title) -Split "`r`n";
  18. $array = $Content | %{ $_.Split('"')[-1] -Replace '(</?(li|div))?>','' };
  19. $array.Add('') | Out-Null;
# '@' is injected as a marker for lines split out of style attributes,
# then used to select (and afterwards strip from) the wanted fragments.
  20. $Content = $Page |SLS -Pattern '</div><div style';
  21. $Content = $Content -Replace ";'>","`r`n@" -Replace '</div>',"`r`n";
  22. $arr = (($Content -Split "`r`n" | sls -Pattern '^@') -NotMatch '^@<') -Replace '@','';
  23. $arr = $arr -notmatch '^$'; if ( $arr ) { $arr.Add('') | Out-Null };
  24. $array = $array + $arr;
  25. $Content = (( $Page | SLS -Pattern '^\s+.*</div>$') -NotMatch '<div') -Split "`r`n";
  26. $arr = $Content | %{ ($_ -Replace '^\s+|</div>|&nbsp;','').Trim() };
  27. $arr = $arr -notmatch '^$'; if ( $arr ) { $arr.Add('') | Out-Null };
  28. $array = $array + $arr;
  29. $Content = ( $Page | SLS -Pattern '<(div|p) style.*</(div|p)>$') -Split "`r`n";
  30. $arr = $Content | %{ $_.Split('"')[-1] -Replace '(</(p|div))?>|&nbsp;','' };
  31. $arr = $arr -notmatch '^$'; if ( $arr ) { $arr.Add('') | Out-Null };
  32. $array = $array + $arr;
  33. $array.Add('*************************完成*************************') | Out-Null;
  34. Return $array;
  35. };
# Entry point: derive the last page number from the "尾页" (last page)
# link, precompute all index-page URLs, then process them one at a time.
  36. $Page = $Web.DownloadString( $url ) -Split "`n";
  37. $Last = (( $Page | SLS -Pattern '尾页' ) -Split "`r`n" ).Split('"')[-2];
  38. [int]$Last = $Last.Split('/.')[-2];
  39. [Collections.Arraylist] $PageArray = @($url);
  40. (2..$Last) | %{ $PageArray += $http + '/index/page/' + "$_.html" };
  41. For ( $i =0; $i -lt $PageArray.Count; $i++ ) {
# Map each detail link on the page to its display title.
# NOTE(review): hashtable '+=' throws if the same link appears twice on a
# page - assumes links are unique per index page; verify.
  42. $DictXianQing = @{};
  43. if ( $i -ge 1 ) { $Page = $Web.DownloadString( $PageArray[$i] ) -Split "`n"; };
  44. $Content = ($Page | SLS -Pattern 'xiangqing') -Split "`r`n";
  45. $Content | %{
  46. $link = $Master + $_.Split('"')[-2];
  47. $str = $_.Split('"')[-1] -Replace '(</(a|li))?>','';
  48. $DictXianQing += @{ "$link" = "$str" };
  49. };
# Fetch the documents in numeric order of the id embedded in the URL and
# append each one's text (plus two blank separator lines) to the log.
  50. Foreach ( $k in $DictXianQing.Keys.GetEnumerator() | Sort {[int]($_.Split('-.')[-2])} ) {
  51. $Receive = ANJian_ZenLi $k;
  52. $Receive |ac $save -force;
  53. ''|ac $save -force; ''|ac $save -force;
  54. };
  55. '按任意键处理下一页,删除此句会整理所有页面。';pause;
  56. };
  57. '已全部完成 按任意键退出。';pause
复制代码

TOP

返回列表