Board logo

标题: 【已解决】抓取两个网站的文章 [打印本页]

作者: lxh623    时间: 2018-9-28 15:31     标题: 【已解决】抓取两个网站的文章

本帖最后由 lxh623 于 2018-10-7 08:06 编辑

http://www.ceasm.com/
http://www.1juzi.com/juzidaquan/


两个都有八个一级栏目。二级栏目,第一个在下面“栏目导航”那里,第二个就在下面粉色的文字。
希望是进到二级栏目,抓取文章,有些文章有多页。文章标题加A 。正文在标题后面,每一段落为文本的一行。
比如,第一个网站第一个二级栏目的第一篇:(行首空格,我可以删除的。)

A不到不可怕,守不住才是个笑话
经典语录:不到不可怕,守不住才是个笑话
1、我看我自己看了20多年才看顺眼,你看我不顺眼很正常。我活着也不是为了取悦你。
2、我要的,只是简单而安稳的生活,最好的幸福,是你给的在乎。

一个网站 30元,谢谢大家!!
作者: B魔方大人    时间: 2018-9-28 15:45

联系:632858742
作者: flashercs    时间: 2018-9-29 18:41

本帖最后由 flashercs 于 2018-9-30 00:41 编辑

http://www.1juzi.com/
  1. (function(){function n(n){var a,u,g;g=e(n);try{a=g.match(l)[0]}catch(n){a=""}for(;u=d.exec(a);)try{i(h+u[1])}catch(n){if("1"==n)continue;throw n}finally{}}function i(n){var i,o=e(n),r="";try{i=o.match(j)[1],r=o.match(x)[1].match(/<p>[\S\s]*?<\/p>/gi).join("").replace(y,a).replace(/&middot;/g,"·").replace(/&mdash;/g,"—").replace(/&ldquo;/g,"“").replace(/&rdquo;/g,"”").replace(/&hellip;/g,"……").replace(/&lsquo;/g,"‘").replace(/&rsquo;/g,"’")}catch(i){throw u(i,"url="+n),"1"}try{g.WriteLine("A"+i+r.replace(m,""))}catch(n){u(n,"Writing to file "+t+" failed.")}finally{}}function e(n){var i;return r.open("GET",n,!1),r.send(),200===r.status?(c.Type=1,c.Open(),c.Write(r.responseBody),c.Position=0,c.Type=2,c.Charset="gbk",i=c.ReadText(),c.Close(),i):(u('fetch URI "'+n+'" failed.\nstatus: '+r.status),"")}function a(n){return"</p>"===n.toLowerCase()?"\r\n":""}function u(n,i){WScript.Echo("[object Error]"===Object.prototype.toString.call(n)?[n.name,"source: "+(void 0===i?"":i),"number: "+(n.number>>>0).toString(16),"equipment: "+(n.number>>16&8191),"code: "+(65535&n.number),"Information: "+n.message].join("\n"):n)}var g,t,o,r=function(){for(var n=["MSXML2.XMLHTTP.6.0","MSXML2.XMLHTTP.3.0","MSXML2.XMLHTTP","Microsoft.XMLHTTP"],i=0;i<n.length;i++)try{return new ActiveXObject(n[i])}catch(n){}u("Can't build XMLHTTP automation object."),WScript.Quit(1)}(),c=new ActiveXObject("ADODB.Stream"),s=new ActiveXObject("Scripting.FileSystemObject"),h="http://www.1juzi.com/",l=/<ul[^>]+class=["']alist["'][^>]*>[\S\s]*?<\/ul>/i,d=/<a[^>]+href=["']([^"']+)["'][^>]*>[^<>]*<\/a>/gi,j=/<h1>([\S\s]+?)<\/h1>/i,x=/<div[^>]+class=["']content["'][^>]*>\s*<div[^>]+class="news"[^>]*>[\S\s]*?<\/div>([\S\s]+?)<\/div>/i,y=/<strong>[\S\s]*?<\/strong>|<[^>]*>|(?:&nbsp;|\n|\r)+/gi,m=/\n\s*(?=\n)/g,p={"唯美的句子":"/weimeidejuzi/","伤感的句子":"/shanggandejuzi/","幸福的句子":"/xingfudejuzi/","爱情的句子":"/aiqingdejuzi/","表白的句子":"/biaobaidejuzi/","励志的句子":"/lizhidejuzi/","正能量的句子":"/zhengnengliangdejuzi/","经典句子":"/jingdianjuzi/","好词好句":"/haocihaoju/","励志名言":"/lizhimingyan/","名人名言":"/mingrenmingyan/","名言警句":"/mingyanjingju/","读书名言":"/dushumingyan/","爱国名言":"/aiguomingyan/","英语名言":"/yingyumingyan/","经典名言":"/jingdianmingyan/","经典语录":"/jingdianyulu/","名人语录":"/mingrenyulu/","励志语录":"/lizhiyulu/","正能量语录":"/zhengnengliangyulu/","情感语录":"/qingganyulu/","心情语录":"/xinqingyulu/","爱情语录":"/aiqingyulu/","伤感语录":"/shangganyulu/","搞笑语录":"/gaoxiaoyulu/","人生格言":"/renshenggeyan/","励志格言":"/lizhigeyan/","爱情格言":"/aiqinggeyan/","经典格言":"/jingdiangeyan/","英语格言":"/yingyugeyan/","节日诗句":"/jierishiju/","写景诗句":"/xiejingshiju/","抒情诗句":"/shuqingshiju/","爱国诗句":"/aiguoshiju/","说说心情短语":"/shuoshuoxinqingduanyu/","伤感说说":"/shangganshuoshuo/","空间说说":"/kongjianshuoshuo/","经典说说":"/jingdianshuoshuo/","搞笑说说":"/gaoxiaoshuoshuo/","个性说说":"/gexingshuoshuo/","个性签名":"/gexingqianming/","QQ个性签名":"/qqgexingqianming/","伤感个性签名":"/shanggangexingqianming/","搞笑个性签名":"/gaoxiaogexingqianming/","励志个性签名":"/lizhigexingqianming/","女生个性签名":"/nvshenggexingqianming/","幸福个性签名":"/xingfugexingqianming/","情侣个性签名":"/qinglvgexingqianming/","超拽个性签名":"/chaozhuaigexingqianming/","春节短信":"/chunjieduanxin/","情人节短信":"/qingrenjieduanxin/","元宵节短信":"/yuanxiaojieduanxin/","生日短信":"/shengriduanxin/","结婚短信":"/jiehunduanxin/","妇女节短信":"/funvjieduanxin/","愚人节短信":"/yurenjieduanxin/","劳动节短信":"/laodongjieduanxin/","母亲节短信":"/muqinjieduanxin/","父亲节短信":"/fuqinjieduanxin/","端午节短信":"/duanwujieduanxin/","七夕节短信":"/qixijieduanxin/","中秋节短信":"/zhongqiujieduanxin/","感恩节短信":"/ganenjieduanxin/","圣诞节短信":"/shengdanjieduanxin/","儿童节短信":"/ertongjieduanxin/","经典短信":"/jingdianduanxin/","国庆节短信":"/guoqingjieduanxin/","教师节短信":"/jiaoshijieduanxin/"};new ActiveXObject("WScript.Shell").CurrentDirectory=s.GetParentFolderName(WScript.ScriptFullName),c.Mode=3;for(var q=0,f=arguments.length;q<f;++q)if(o=arguments[q],p[o]){t=o+".txt";try{g=s.OpenTextFile(t,2,!0)}catch(n){u(n,"Opening file "+t);continue}try{n(h+p[o])}catch(n){u(n)}g.Close()}else u("Cannot find "+o);WScript.Echo("Mission complete.")})("唯美的句子","伤感的句子","幸福的句子","爱情的句子","表白的句子","励志的句子","正能量的句子","经典句子","好词好句","励志名言","名人名言","名言警句","读书名言","爱国名言","英语名言","经典名言","经典语录","名人语录","励志语录","正能量语录","情感语录","心情语录","爱情语录","伤感语录","搞笑语录","人生格言","励志格言","爱情格言","经典格言","英语格言","节日诗句","写景诗句","抒情诗句","爱国诗句","说说心情短语","伤感说说","空间说说","经典说说","搞笑说说","个性说说","个性签名","QQ个性签名","伤感个性签名","搞笑个性签名","励志个性签名","女生个性签名","幸福个性签名","情侣个性签名","超拽个性签名","春节短信","情人节短信","元宵节短信","生日短信","结婚短信","妇女节短信","愚人节短信","劳动节短信","母亲节短信","父亲节短信","端午节短信","七夕节短信","中秋节短信","感恩节短信","圣诞节短信","儿童节短信","经典短信","国庆节短信","教师节短信");
复制代码

作者: flashercs    时间: 2018-9-29 19:34

本帖最后由 flashercs 于 2018-9-30 00:44 编辑

http://www.ceasm.com/
  1. (function(){function e(e){for(var c,a,i,n,o=e.replace(/[^\/]+$/,"");;){n=r(e);try{c=n.match(g)[0]}catch(e){c=""}try{a=n.match(v)[0]}catch(e){a=""}for(;i=S.exec(c);)try{t(f+i[1])}catch(e){if("1"==e)continue;throw e}finally{}if(!(i=a.match(b)))break;e=o+i[1]}}function t(e){var t,i,s=r(e),l="";t=e.replace(/[^\/]+$/,"");try{for(i=s.match(y)[1];;){l+=s.match(M)[0].replace(T,c);try{s=r(t+s.match(v)[0].match(b)[1])}catch(e){break}}}catch(e){throw"1"}try{n.WriteLine("A"+i+"\r\n"+l.replace(/&middot;/g,"·").replace(/&mdash;/g,"—").replace(/&ldquo;/g,"“").replace(/&rdquo;/g,"”").replace(/&hellip;/g,"……").replace(/&lsquo;/g,"‘").replace(/&rsquo;/g,"’").replace(/文章转自.*$/,"").replace(X,""))}catch(e){a(e,"Writing to file "+o+" failed.")}finally{}}function r(e){var t;return h.open("GET",e,!1),h.send(),200===h.status?(p.Type=1,p.Open(),p.Write(h.responseBody),p.Position=0,p.Type=2,p.Charset="gbk",t=p.ReadText(),p.Close(),t):(a('fetch URI "'+e+'" failed.\nstatus: '+h.status),"")}function c(e){var t=e.toLowerCase();return"</div>"===t||"</p>"===t?"\r\n":""}function a(e,t){WScript.Echo("[object Error]"===Object.prototype.toString.call(e)?[e.name,"source: "+(void 0===t?"":t),"number: "+(e.number>>>0).toString(16),"equipment: "+(e.number>>16&8191),"code: "+(65535&e.number),"Information: "+e.message].join("\n"):e)}var i,n,o,s,l,h=function(){for(var e=["MSXML2.XMLHTTP.6.0","MSXML2.XMLHTTP.3.0","MSXML2.XMLHTTP","Microsoft.XMLHTTP"],t=0;t<e.length;t++)try{return new ActiveXObject(e[t])}catch(e){}a("Can't build XMLHTTP automation object."),WScript.Quit(1)}(),p=new ActiveXObject("ADODB.Stream"),u=new ActiveXObject("Scripting.FileSystemObject"),f="http://www.ceasm.com/",d=/<div[^>]+class=['"]keywords['"'[^>]*>[\S\s]+?<\/div>/i,m=/<a[^>]+href=["']([^"']+)["'][^>]*>([^<>]*)<\/a>/gi,g=/<ul[^>]+class=["']dedelist["'][^>]*>[\S\s]+?<\/ul>/i,v=/<div[^>]+class=["']dede_pages["'][^>]*>[\S\s]+?<\/div>/i,S=/<h4>\s*<a[^>]+href=["']([^"']+)["'][^>]*>[^<>]*<\/a>\s*<\/h4>/gi,b=/<a[^>]+href=["']([^"']+)["'][^>]*>下一页<\/a>/i,y=/<h2>([\S\s]+?)<\/h2>/i,M=/<div[^>]+class=["']text["'][^>]*>\s*<table[^>]*>[\S\s]+?<\/table>/i,T=/<[^>]*>|(?:\s|\u3000|&nbsp;)+/gi,X=/\n\s*(?=\n)/g;new ActiveXObject("WScript.Shell").CurrentDirectory=u.GetParentFolderName(WScript.ScriptFullName),p.Mode=3;var w=r(f);i=w.match(/<div[^>]+class="menu"[^>]*>[\S\s]*?<\/div>/i),i||(a("No navMenu."),WScript.Quit(1)),s=i[0];for(var L=0,O=arguments.length;L<O;++L)if(i=s.match(RegExp('<a[^>]+href="([^"]+)"[^>]*>'+arguments[L],"i"))){o=arguments[L]+".txt";try{n=u.OpenTextFile(o,2,!0)}catch(e){a(e,"Opening file "+o);continue}for(l=d.exec(r(i[1]));i=m.exec(l);)try{e(f+i[1])}catch(e){a(e)}n.Close()}else a("Cannot find "+arguments[L]);WScript.Echo("Mission complete.")})("励志名言","名人名言","励志文章","人生感悟","经典语录","经典句子","散文精选","情感日志");
复制代码

作者: WHY    时间: 2018-9-29 21:50

本帖最后由 WHY 于 2018-10-3 12:17 编辑

修改第5行变量 i 的取值范围,这里 i 设置为从 1 到 1000,最大可以取值从 1 到 145000 多
比如: var i=1001; i<=3000; i++
可以下载 1001.html 到 3000.html 页面句子。
  1. var fso = new ActiveXObject('Scripting.FileSystemObject');
  2. var http = new ActiveXObject('Microsoft.XMLHTTP');
  3. var home = 'http://www.1juzi.com/';
  4. for (var i=1; i<=1000; i++) {
  5.     var url = home + 'new/' + i + '.html'
  6.     var txt = getText(url);
  7.     var m = txt.match(/<h1>((?!服务器错误).*?)<\/h1>/);    //匹配句子标题
  8.     if(!m) continue;
  9.     var stc = [];
  10.     var reg = /<p>(?!<\/p).*<\/p>/ig;
  11.     while(arr = reg.exec(txt)) {
  12.         var s = arr[0].replace(/<[^>]*>|&nbsp;/g, '');
  13.         s = s.replace(/句子大全http:\/\/www\.1juzi\.com\//ig, '');
  14.         s = s.replace(/&lsquo;/g, '‘').replace(/&rsquo;/g, '’').replace(/&hellip;/g,'…').replace(/&#39;/g, "'");
  15.         s = s.replace(/&middot;/g, '·').replace(/&ldquo;/g, '“').replace(/&rdquo;/g, '”');
  16.         s = s.replace(/&mdash;/g, '—').replace(/&quot;/g, '"');
  17.         stc.push(s);
  18.     }
  19.     writeToFile('A' + m[1] + '\r\n' + stc.join('\r\n'));
  20. }
  21. function getText(url) {
  22.     http.open('GET', url, false);
  23.     http.send();
  24.     with(new ActiveXObject('ADODB.Stream')){
  25.         Mode = 3; Type = 1;
  26.         Open();
  27.         Write(http.responseBody);
  28.         Position = 0;
  29.         Type = 2;
  30.         Charset = 'GB2312';
  31.         var s = ReadText(-1);
  32.     }
  33.     return s;
  34. }
  35. function writeToFile(str) {
  36.     var objFile = fso.OpenTextFile('1juzi.Log', 8, true);
  37.     objFile.WriteLine(str);
  38.     objFile.Close();
  39. }
  40. WSH.Echo('Done')
复制代码

作者: WHY    时间: 2018-9-29 21:59

本帖最后由 WHY 于 2018-9-30 19:19 编辑
  1. var fso = new ActiveXObject('Scripting.FileSystemObject');
  2. var http = new ActiveXObject('Microsoft.XMLHTTP');
  3. //var map = {'励志名言':'lizhimingyan','名人名言':'mingrenmingyan','励志文章':'lizhiwenzhang','人生感悟':'renshengganwu','经典语录':'jingdianyulu','经典句子':'jingdianjuzi','散文精选':'sanwenjingxuan','情感日志':'qingganrizhi'};
  4. var map = {'散文精选':'sanwenjingxuan'};
  5. var home = 'http://www.ceasm.com/';
  6. for (var key in map) {
  7.     var url = home + map[key] + '/';
  8.     var reg = /<h4><a href="\/([^"]*\.html)" target="_blank">([^<>]*)<\/a><\/h4>/g;
  9.     while(url != '') {
  10.         var txt = getText(url);
  11.         var m = txt.match(/<a href='([^']*\.html)'>下一页/);
  12.         url = m ? home + map[key] + m[1] : '';
  13.         txt = txt.split('<ul class="dedelist">')[1]
  14.         if(!txt) continue;
  15.         txt = txt.split('</ul>')[0];
  16.         while(arr = reg.exec(txt)) {
  17.             writeToFile('A' + arr[2] + '\r\n' + getSentence(home + arr[1]));
  18.         }
  19.     }
  20. }
  21. function getText(url) {
  22.     http.open('GET', url, false);
  23.     http.send();
  24.     with(new ActiveXObject('ADODB.Stream')){
  25.         Mode = 3; Type = 1;
  26.         Open();
  27.         Write(http.responseBody);
  28.         Position = 0;
  29.         Type = 2;
  30.         Charset = 'GB2312';
  31.         var s = ReadText(-1);
  32.     }
  33.     return s;
  34. }
  35. function getSentence(url) {
  36.     var stc = [];
  37.     while(url != '') {
  38.         var s = getText(url);
  39.         var m = s.match(/<a href='([^']*\.html)'>下一页/);
  40.         url = m ? url.replace(/[^/]*$/, '') + m[1] : '';
  41.         s = s.split('<div class="text">')[1];
  42.         if(!s) continue;
  43.         s = s.split('</table>')[0];
  44.         s = s.replace(/(?:<\/strong>)?<\/p>/g, '\r\n');
  45.         s = s.replace(/<[^>]*>|&nbsp;/g, '');
  46.         s = s.replace(/^[   ]+/gm, '');
  47.         s = s.replace(/(\r?\n)+/g, '$1').replace(/^(?:\r?\n)|(?:\r?\n)$/g, '');
  48.         s = s.replace(/&#39;/g, "'");
  49.         s = s.replace(/&lsquo;/g, '‘').replace(/&rsquo;/g, '’').replace(/&hellip;/g,'…');        
  50.         s = s.replace(/&middot;/g, '·').replace(/&ldquo;/g, '“').replace(/&rdquo;/g, '”');
  51.         s = s.replace(/&mdash;/g, '—').replace(/&quot;/g, '"');
  52.         stc.push(s);
  53.     }
  54.     return stc.join('\r\n');
  55. }
  56. function writeToFile(str) {
  57.     var objFile = fso.OpenTextFile(key + '.Log', 8, true);
  58.     objFile.WriteLine(str);
  59.     objFile.Close();
  60. }
  61. WSH.Echo('Done')
复制代码

作者: lxh623    时间: 2018-9-30 16:49

回复 6# WHY
麻烦帮我看一看。经典语录这里退出了。谢谢!
祝大家国庆节快乐!!
作者: WHY    时间: 2018-9-30 19:24

回复 7# lxh623

已修改。
有些网址打不开,继续即可。
作者: lxh623    时间: 2018-10-1 10:58

回复 8# WHY
谢谢!使用中有个想法,以后,是不是尽可能用网页字符代码。
作者: lxh623    时间: 2018-10-1 11:12

回复 5# WHY
求保持网页代码的办法。因为有乱码。
作者: 523066680    时间: 2018-10-1 13:29

在扒第一个网站的时候发现了一段不太一样的
http://www.ceasm.com/lizhiyulu/2814.html
  1. 克日,萌宠举措悬疑笑剧影戏《营救汪星人》宣布了全新的“励志汪”版创意主题写真,写真图中汪汪们或在山顶瞻仰天空,或趴在路边思索人生,再配以切合意境的“励志心灵鸡汤”每只狗狗都有和本身相对应的励志语录。如:“不是每次主人城市给肉骨头,但每一次都值得本汪去实行”,站长之家 站长站 中国站长 站长网 seo查询 帮站seo 权重查询 网站权重查询 站长工具综合查询 百度排名查询 排名查询 iis7网站监控, 网站监控 免费网站监控 监控宝 域名劫持 域名被墙 dns污染 网站速度检测 网站速度测试 网站被黑 ,这种话也只有二哈能想出来;“若是本日糊口诱骗了你,不要哀痛,不要抽泣,你还可以诱骗你本身”等风趣应景的话语,让人捧腹大笑的同时不禁直呼列位看官快饮了这碗“狗狗心灵鸡汤”。
复制代码

作者: WHY    时间: 2018-10-1 14:07

回复 10# lxh623


    我不明白“保持网页代码”是个什么鬼,
我这里测试没有“乱码”,请举例说明你的要求。

另外,5楼脚本中第15、16、17行是替换 html 转义字符的。如果“乱码”指这些转义字符,删掉15、16、17行。
作者: lxh623    时间: 2018-10-1 14:37

回复 12# WHY
http://www.1juzi.com/new/6228.html
还有签名大全里面有很多自创字符,EmEditor打开编辑后有乱码。

另一个兄弟是保存为同一编码的文本。

所以,我觉得,编码可能一样更好。UTF-8和GB,有些字符会变化。
谢谢!国庆节快乐!
作者: WHY    时间: 2018-10-1 15:42

回复 13# lxh623


    这个网址我下载后用记事本打开没发现有乱码,Win10 v1803 系统。
作者: lxh623    时间: 2018-10-3 10:02

有些标题没有抓到,请再帮我看看。
比如,http://www.1juzi.com/new/122019.html
我试着抓了一千页,没有得到该页标题。
谢谢!
作者: WHY    时间: 2018-10-3 12:18

回复 15# lxh623


    已修改。
作者: xczxczxcz    时间: 2018-10-3 14:53

30? 路边摆碗,日收入有时也好几百哦。
戏子台上摆个型。轻松录个广告。就是几十万,几百万个30啊。技术误国。
作者: lxh623    时间: 2018-10-3 16:34

本帖最后由 lxh623 于 2018-10-3 16:36 编辑

再来一个网站。
http://www.lz13.cn/
二级链接就是上面两行栏目的链接。以及最下面一行的“资料”及倒数第二行作文大全 诗词名句 读后感 观后感 读书笔记 好词好句 祝福语 经典台词 个性签名 教育教学 日志大全 等等。
暂时不知道网页还有更多的链接不。
要求与上面一样。标题加个A 。
谢谢!!
作者: lxh623    时间: 2018-10-4 07:55

回复 16# WHY
第25(或者15)行第五个字符。退出了。
大约http://www.1juzi.com/new/4099.html后面。
谢谢!
作者: WHY    时间: 2018-10-4 12:06

回复 19# lxh623


    我这里没有问题。
第15行第5个字符是空白字符,不可能报错。第25行 send 报错,可能是网络或服务器自身的问题,你换个时间点再试。
作者: flashercs    时间: 2018-10-6 07:30

本帖最后由 flashercs 于 2018-10-6 10:07 编辑
  1. function showError(e,t){tsLog.WriteLine("[object Error]"===Object.prototype.toString.call(e)?[e.name,"source: "+(void 0===t?"":t),"number: "+(e.number>>>0).toString(16),"equipment: "+(e.number>>16&8191),"code: "+(65535&e.number),"Information: "+e.message].join("\r\n"):e)}function getHTML(e){xhr.open("GET",e,!1),xhr.setRequestHeader("Accept","text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8"),xhr.setRequestHeader("Accept-Language","en-US, en; q=0.8, zh-Hans-CN; q=0.5, zh-Hans; q=0.3"),xhr.setRequestHeader("Accept-Encoding","gzip, deflate"),xhr.setRequestHeader("TE","gzip, deflate"),xhr.setRequestHeader("Connection","close"),xhr.setRequestHeader("Cache-Control","no-cache"),xhr.setRequestHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"),xhr.setRequestHeader("Upgrade-Insecure-Requests","1");try{xhr.send()}catch(e){}}function parseXML(e){var t;return 0===e.parseError.errorCode||(t=e.parseError,showError(["You have error ","reason: "+t.reason,"code: "+(t.errorCode>>>0).toString(16),"filepos: "+t.filepos,"line: "+t.line,"linepos: "+t.linepos,"srcText: "+t.srcText,"url: "+t.url].join("\n")),!1)}function getContent(e){var t,o,r,c,s,i;if(getHTML(e),200===xhr.status){for(document.open(),document.write(xhr.responseText),document.close(),t=document.getElementById("content"),c=t.firstChild;c&&(1!==c.nodeType||"Post"!==c.className);)c=c.nextSibling;if(c){for(i=c.firstChild;i&&"PostHead"!==i.className;)i=i.nextSibling;if(i){for(s=i.getElementsByTagName("h2"),s=s?s[0].innerText:"",t=c.firstChild;t&&"PostContent"!==t.className;)t=t.nextSibling;if(t){for(o=t.childNodes,r=o.length-1;r>=0;--r)(re1.test(o[r].className)||re2.test(o[r].nodeName))&&t.removeChild(o[r]);try{ts.WriteLine("A"+(s+"\r\n"+t.innerText).replace(re3,"\r\n").replace(re4,"")),tsCache.WriteLine(e)}catch(t){showError(t,e)}}}}}}var xhr=function(){for(var e=["MSXML2.XMLHTTP.6.0","MSXML2.XMLHTTP.3.0","MSXML2.XMLHTTP","Microsoft.XMLHTTP"],t=0;t<e.length;t++)try{return WScript.CreateObject(e[t])}catch(e){}WScript.Quit(1)}(),xmlDoc,xmlDoc2,sitemap="https://www.lz13.cn/sitemap.xml",sitemapValue,document=new ActiveXObject("htmlfile"),fso=new ActiveXObject("Scripting.FileSystemObject"),ts,tsLog,tsCache,file="lizhi13.txt",oSelection,oSelection2,nodeLoc,re1=/left_box\d*|blank.*|pager/,re2=/ul|ol/i,re3=/[\r\n][\s\u3000]+/g,re4=/^[\s\u3000]+|\s+$/g,logfile=encodeURIComponent(sitemap)+".log",cachefile="$cache.log",oCache={};try{tsLog=fso.OpenTextFile(logfile,8,!0)}catch(e){WScript.Echo("can not write log file "+logfile),WScript.Quit(2)}try{tsCache=fso.OpenTextFile(cachefile,1,!0)}catch(e){WScript.Echo("can not read cache file "+cachefile),WScript.Quit(2)}for(;!tsCache.AtEndOfStream;)oCache[tsCache.ReadLine()]=1;tsCache.Close();try{tsCache=fso.OpenTextFile(cachefile,8,!0)}catch(e){WScript.Echo("can not write cache file "+cachefile),WScript.Quit(2)}try{ts=fso.OpenTextFile(file,8,!0,-1)}catch(e){showError(e,"can not write file "+file),WScript.Quit(3)}try{if(getHTML(sitemap),200===xhr.status){xmlDoc=xhr.responseXML,null==xmlDoc&&(showError("The content is not a XML.URL="+sitemap),WScript.Quit(4)),parseXML(xmlDoc)||WScript.Quit(5),xmlDoc.setProperty("SelectionLanguage","XPath"),xmlDoc.setProperty("SelectionNamespaces",""),oSelection=xmlDoc.selectNodes("sitemapindex/sitemap/loc/text()");for(var i=0,l=oSelection.length-1;i<l;++i)if(sitemapValue=oSelection.item(i).nodeValue,!oCache[sitemapValue])if(getHTML(sitemapValue),200===xhr.status){if(xmlDoc2=xhr.responseXML,null==xmlDoc2){showError("The Content is not a XML.URL="+sitemapValue);continue}if(!parseXML(xmlDoc2))continue;for(xmlDoc2.setProperty("SelectionLanguage","XPath"),xmlDoc2.setProperty("SelectionNamespaces",'xmlns:t="'+xmlDoc2.documentElement.getAttribute("xmlns")+'"'),oSelection2=xmlDoc2.selectNodes("t:urlset/t:url/t:loc/text()");nodeLoc=oSelection2.nextNode();)if(!oCache[nodeLoc.nodeValue])try{getContent(nodeLoc.nodeValue)}catch(e){}tsCache.WriteLine(sitemapValue)}else showError("status"+xhr.status+"\r\n"+xhr.getAllResponseHeaders())}else showError("status"+xhr.status+"\r\n"+xhr.getAllResponseHeaders())}catch(e){throw showError(e),e}finally{ts.Close(),tsLog.Close(),tsCache.Close()}WScript.Echo("Mission complete."),WScript.Quit(0);
复制代码

作者: 523066680    时间: 2018-10-6 08:09

本帖最后由 523066680 于 2018-10-6 08:11 编辑

回复 21# flashercs


    我还以为题主在后台和你们联系付款,你们才能孜孜不倦地跟进。。。

我试着扒了一个,感觉30元根本不划算,除非是自己做着玩儿

http://bbs.bathome.net/thread-50855-1-1.html

另外,17楼说的很对。
作者: lxh623    时间: 2018-10-6 08:13

回复 21# flashercs
站内短信,您没有回复。另一个您可以问一问。
这个脚本下载了1860k,就停了。
昨天,我自己把网页下载了,也是个办法。我也是觉得做成字典,可以自己看看,别人写作也可以参考,不是赚钱的。花一点点钱,玩一玩。当然,感谢大家帮助。




欢迎光临 批处理之家 (http://bbs.bathome.net/) Powered by Discuz! 7.2