本帖最后由 apang 于 2015-6-13 14:01 编辑
保存为 test.bat；需要先下载 HtoX32c.exe，并与批处理放在同一目录下；处理后的 txt 为 ANSI 编码。
@set @n=0;/* & echo off
rem ---------------------------------------------------------------
rem Batch half of a batch/JScript hybrid. The JScript engine treats
rem everything up to the closing marker on the last line of this
rem section as one block comment, so these lines only run under cmd.
rem ---------------------------------------------------------------
rem Work inside a Result sub-folder; the error printed when it already
rem exists is discarded.
md Result 2>nul
pushd Result\
rem Run this same file as JScript. The fully qualified script path is
rem passed because the working directory has just changed, so a
rem relative name typed at a prompt would no longer resolve.
cscript -nologo -e:jscript "%~f0"
rem Convert the downloaded pages with HtoX32c, which must sit next to
rem this script. When the converter is missing the html files are kept
rem instead of being deleted with nothing to show for the download.
if exist "%~dp0HtoX32c.exe" (
    "%~dp0HtoX32c" /IP /O0 *.html 2>nul
    del *.html
) else (
    echo HtoX32c.exe not found next to "%~nx0" - keeping the .html files.
)
rem Undo the pushd so the caller's working directory is restored.
popd
pause & exit/b & rem */
-
// ---------------------------------------------------------------
// Main script: walk the Baidu result pages for the query below and
// hand every hit to SaveFile, which stores it as "<title>.html" in
// the current directory.
// ---------------------------------------------------------------

// The query holds Chinese characters, a colon and a space, so it is
// percent-encoded (UTF-8) rather than pasted raw into the URL, and the
// encoding is declared to the server with ie=utf-8.
var url = "http://www.baidu.com/s?ie=utf-8&wd=" + encodeURIComponent("title:管理 site:www.fjdh.cn") + "&pn=";
var intTotal = 0;   // number of files saved so far; incremented by SaveFile
var re = /"title":"(.+?)","url":"(.+?)"/ig;   // title / url pairs embedded in a result page
var arr;            // current match: arr[1] = title, arr[2] = url

// Fetch result pages 1 through 61 (pn = 0, 10, ..., 600); the title of each
// hit becomes the name of the saved html file.
for (var i = 0; i < 61; i++) {
    var http = new ActiveXObject("MSXML2.ServerXMLHTTP");
    try {
        http.open("GET", url + i * 10, false);
        http.send();
        var s = GetText(http.responseBody);
        // re is a global regex and remembers where its last search stopped;
        // always start a new page from the beginning.
        re.lastIndex = 0;
        while ((arr = re.exec(s)) != null) {
            // A hit that fails to download or save is reported and skipped,
            // so the remaining hits on this page are still processed.
            try {
                http.open("GET", arr[2], false);
                http.send();
                SaveFile(http.responseBody, arr[1]);
            } catch (e2) { WScript.Echo(e2.message) }
        }
    } catch (e) { WScript.Echo(e.message) }
    http = null;
}
-
// Decode a binary HTTP body as UTF-8 text.
//   bin     - binary data, e.g. the responseBody of an XMLHTTP request
//   returns - the decoded string
function GetText(bin) {
    var stream = new ActiveXObject("ADODB.Stream");
    try {
        stream.Mode = 3;            // adModeReadWrite
        stream.Type = 1;            // adTypeBinary: accept the raw bytes
        stream.Open();
        stream.Write(bin);
        stream.Position = 0;        // rewind before switching the stream type
        stream.Type = 2;            // adTypeText: read the same bytes back as text
        stream.CharSet = "UTF-8";
        return stream.ReadText();
    } finally {
        // Release the stream even when one of the steps above throws.
        if (stream.State != 0) stream.Close();   // 0 = adStateClosed
    }
}
-
// Keep the article body of a fetched page and save it, GBK-encoded, as
// "<strName>.html" in the current directory.
//   bin     - binary body of the fetched page; decoded as UTF-8
//   strName - page title; characters not allowed in file names are removed
//   returns - "" when the file already exists (nothing is written),
//             otherwise undefined
// Side effects: increments the global intTotal and echoes a numbered
// progress line for every file it writes.
function SaveFile(bin, strName) {
    // Strip characters that are not allowed in Windows file names; the
    // double quote is as invalid as the rest, so it is removed too.
    strName = strName.replace(/[\/\|\\:<>\?\*"]/g, "");
    var fso = new ActiveXObject("Scripting.FileSystemObject");
    if (fso.FileExists(strName + ".html")) { return "" }
    intTotal += 1;
    // Zero-padded three-digit running number, e.g. "007 title.html".
    WScript.Echo((intTotal + 1000 + "").substr(1) + " " + strName + ".html");

    var stream = new ActiveXObject("ADODB.Stream");
    try {
        stream.Mode = 3;            // adModeReadWrite
        stream.Type = 1;            // adTypeBinary: accept the raw bytes
        stream.Open();
        stream.Write(bin);
        stream.Position = 0;        // rewind before switching the stream type
        stream.Type = 2;            // adTypeText
        stream.CharSet = "UTF-8";
        // Keep only what lies between the "正文内容" heading and the "标签:" marker.
        var txt = stream.ReadText().replace(/[\s\S]+>正文内容<.*\n/, "").split(">标签:<")[0];
        // Drop named HTML entities, which otherwise come out as garbage characters.
        txt = txt.replace(/&[a-z]+;/g, "");
        // Overwrite the stream from the start with the extracted text as GBK.
        stream.Position = 0;
        stream.CharSet = "GBK";
        stream.WriteText(txt);
        // Truncate at the current position so bytes of the original page that
        // lie past the newly written text do not end up in the file. This must
        // be an actual call; naming the method without parentheses is only a
        // member reference.
        stream.SetEOS();
        stream.SaveToFile(strName + ".html", 2);   // 2 = adSaveCreateOverWrite
    } finally {
        // Release the stream even when decoding or saving throws.
        if (stream.State != 0) stream.Close();     // 0 = adStateClosed
    }
}
复制代码
|