标题: 【已解决】抓取两个网站的文章 [打印本页]
作者: lxh623 时间: 2018-9-28 15:31 标题: 【已解决】抓取两个网站的文章
本帖最后由 lxh623 于 2018-10-7 08:06 编辑
http://www.ceasm.com/
http://www.1juzi.com/juzidaquan/
两个都有八个一级栏目。二级栏目,第一个在下面“栏目导航”那里,第二个就在下面粉色的文字。
希望是进到二级栏目,抓取文章,有些文章有多页。文章标题加A 。正文在标题后面,每一段落为文本的一行。
比如,第一个网站第一个二级栏目的第一篇:(行首空格,我可以删除的。)
A不到不可怕,守不住才是个笑话
经典语录:不到不可怕,守不住才是个笑话
1、我看我自己看了20多年才看顺眼,你看我不顺眼很正常。我活着也不是为了取悦你。
2、我要的,只是简单而安稳的生活,最好的幸福,是你给的在乎。
一个网站 30元,谢谢大家!!
作者: B魔方大人 时间: 2018-9-28 15:45
联系:632858742
作者: flashercs 时间: 2018-9-29 18:41
本帖最后由 flashercs 于 2018-9-30 00:41 编辑
// http://www.1juzi.com/
/**
 * Windows Script Host (JScript) scraper for http://www.1juzi.com/.
 *
 * For every category name passed to the wrapper function, the category path is
 * looked up in CATEGORY_PATHS, the category page is downloaded, every article
 * linked from its <ul class="alist"> list is fetched, and each article is
 * written to "<category>.txt" in the script's own folder as
 * "A" + title followed by the article paragraphs (one paragraph per line).
 *
 * Notes on this revision:
 *  - The listing this was recovered from was hard-wrapped in the middle of a
 *    regex literal and had its HTML entity names already decoded (patterns
 *    such as /“/ replaced by "“"), so the entity replacements did nothing and
 *    the "&nbsp;" pattern had become a plain space. The entity names are
 *    restored below.
 *  - Catch parameters no longer reuse the names of functions/arrays that are
 *    still needed; classic JScript engines are documented to bind the catch
 *    identifier in the enclosing function scope.
 *  - The global link regex is reset before each category so an aborted scan
 *    cannot leave a stale lastIndex behind.
 */
(function () {
  var BASE_URL = "http://www.1juzi.com/";

  var RE_ARTICLE_LIST = /<ul[^>]+class=["']alist["'][^>]*>[\S\s]*?<\/ul>/i;
  var RE_ARTICLE_LINK = /<a[^>]+href=["']([^"']+)["'][^>]*>[^<>]*<\/a>/gi;
  var RE_TITLE = /<h1>([\S\s]+?)<\/h1>/i;
  var RE_CONTENT = /<div[^>]+class=["']content["'][^>]*>\s*<div[^>]+class="news"[^>]*>[\S\s]*?<\/div>([\S\s]+?)<\/div>/i;
  var RE_PARAGRAPH = /<p>[\S\s]*?<\/p>/gi;
  // Removes <strong> blocks, every other tag, &nbsp; entities and raw line breaks.
  var RE_STRIP = /<strong>[\S\s]*?<\/strong>|<[^>]*>|(?:&nbsp;|\n|\r)+/gi;
  var RE_BLANK_LINES = /\n\s*(?=\n)/g;

  // Category display name -> site path.
  var CATEGORY_PATHS = {
    "唯美的句子": "/weimeidejuzi/", "伤感的句子": "/shanggandejuzi/", "幸福的句子": "/xingfudejuzi/",
    "爱情的句子": "/aiqingdejuzi/", "表白的句子": "/biaobaidejuzi/", "励志的句子": "/lizhidejuzi/",
    "正能量的句子": "/zhengnengliangdejuzi/", "经典句子": "/jingdianjuzi/", "好词好句": "/haocihaoju/",
    "励志名言": "/lizhimingyan/", "名人名言": "/mingrenmingyan/", "名言警句": "/mingyanjingju/",
    "读书名言": "/dushumingyan/", "爱国名言": "/aiguomingyan/", "英语名言": "/yingyumingyan/",
    "经典名言": "/jingdianmingyan/", "经典语录": "/jingdianyulu/", "名人语录": "/mingrenyulu/",
    "励志语录": "/lizhiyulu/", "正能量语录": "/zhengnengliangyulu/", "情感语录": "/qingganyulu/",
    "心情语录": "/xinqingyulu/", "爱情语录": "/aiqingyulu/", "伤感语录": "/shangganyulu/",
    "搞笑语录": "/gaoxiaoyulu/", "人生格言": "/renshenggeyan/", "励志格言": "/lizhigeyan/",
    "爱情格言": "/aiqinggeyan/", "经典格言": "/jingdiangeyan/", "英语格言": "/yingyugeyan/",
    "节日诗句": "/jierishiju/", "写景诗句": "/xiejingshiju/", "抒情诗句": "/shuqingshiju/",
    "爱国诗句": "/aiguoshiju/", "说说心情短语": "/shuoshuoxinqingduanyu/", "伤感说说": "/shangganshuoshuo/",
    "空间说说": "/kongjianshuoshuo/", "经典说说": "/jingdianshuoshuo/", "搞笑说说": "/gaoxiaoshuoshuo/",
    "个性说说": "/gexingshuoshuo/", "个性签名": "/gexingqianming/", "QQ个性签名": "/qqgexingqianming/",
    "伤感个性签名": "/shanggangexingqianming/", "搞笑个性签名": "/gaoxiaogexingqianming/",
    "励志个性签名": "/lizhigexingqianming/", "女生个性签名": "/nvshenggexingqianming/",
    "幸福个性签名": "/xingfugexingqianming/", "情侣个性签名": "/qinglvgexingqianming/",
    "超拽个性签名": "/chaozhuaigexingqianming/", "春节短信": "/chunjieduanxin/",
    "情人节短信": "/qingrenjieduanxin/", "元宵节短信": "/yuanxiaojieduanxin/", "生日短信": "/shengriduanxin/",
    "结婚短信": "/jiehunduanxin/", "妇女节短信": "/funvjieduanxin/", "愚人节短信": "/yurenjieduanxin/",
    "劳动节短信": "/laodongjieduanxin/", "母亲节短信": "/muqinjieduanxin/", "父亲节短信": "/fuqinjieduanxin/",
    "端午节短信": "/duanwujieduanxin/", "七夕节短信": "/qixijieduanxin/", "中秋节短信": "/zhongqiujieduanxin/",
    "感恩节短信": "/ganenjieduanxin/", "圣诞节短信": "/shengdanjieduanxin/", "儿童节短信": "/ertongjieduanxin/",
    "经典短信": "/jingdianduanxin/", "国庆节短信": "/guoqingjieduanxin/", "教师节短信": "/jiaoshijieduanxin/"
  };

  var categories = arguments; // category names supplied at the call site below
  var xhr = createXhr();
  var stream = new ActiveXObject("ADODB.Stream");
  var fso = new ActiveXObject("Scripting.FileSystemObject");
  var out;     // text stream of the category file currently being written
  var outName; // its file name, used in error messages

  /** Returns the first XMLHTTP automation object that can be created; quits with exit code 1 if none can. */
  function createXhr() {
    var progIds = ["MSXML2.XMLHTTP.6.0", "MSXML2.XMLHTTP.3.0", "MSXML2.XMLHTTP", "Microsoft.XMLHTTP"];
    for (var idx = 0; idx < progIds.length; idx++) {
      try {
        return new ActiveXObject(progIds[idx]);
      } catch (unavailable) {
        // This ProgID is not registered on the machine; try the next one.
      }
    }
    showError("Can't build XMLHTTP automation object.");
    WScript.Quit(1);
  }

  /**
   * Reports a problem through WScript.Echo.
   * @param err    an Error object (formatted field by field) or any other value (echoed as is)
   * @param source optional description of where the error happened
   */
  function showError(err, source) {
    if (Object.prototype.toString.call(err) !== "[object Error]") {
      WScript.Echo(err);
      return;
    }
    WScript.Echo([
      err.name,
      "source: " + (typeof source === "undefined" ? "" : source),
      "number: " + (err.number >>> 0).toString(16),
      "equipment: " + (err.number >> 16 & 8191),
      "code: " + (err.number & 65535),
      "Information: " + err.message
    ].join("\n"));
  }

  /**
   * Downloads a URL synchronously and decodes the response body as GBK.
   * @returns the page text, or "" (after reporting) when the status is not 200
   */
  function fetchText(url) {
    xhr.open("GET", url, false);
    xhr.send();
    if (xhr.status !== 200) {
      showError('fetch URI "' + url + '" failed.\nstatus: ' + xhr.status);
      return "";
    }
    stream.Type = 1; // adTypeBinary: load the raw bytes
    stream.Open();
    try {
      stream.Write(xhr.responseBody);
      stream.Position = 0;
      stream.Type = 2; // adTypeText: re-read the same bytes as text
      stream.Charset = "gbk";
      return stream.ReadText();
    } finally {
      stream.Close(); // the shared stream must be closed before the next Open()
    }
  }

  /** Replacement callback for RE_STRIP: a closing </p> becomes a line break, everything else is removed. */
  function paragraphBreak(matched) {
    return matched.toLowerCase() === "</p>" ? "\r\n" : "";
  }

  /** Replaces the typographic HTML entities used by the site with the characters they stand for. */
  function decodeEntities(text) {
    return text
      .replace(/&middot;/g, "·")
      .replace(/&mdash;/g, "—")
      .replace(/&ldquo;/g, "“")
      .replace(/&rdquo;/g, "”")
      .replace(/&hellip;/g, "……")
      .replace(/&lsquo;/g, "‘")
      .replace(/&rsquo;/g, "’");
  }

  /**
   * Downloads one article and appends it to the current category file.
   * Pages that cannot be parsed are reported and skipped; a write failure is
   * reported but does not stop the crawl.
   */
  function saveArticle(url) {
    var html = fetchText(url);
    var title;
    var body;
    if (html === "") {
      return; // fetchText already reported the failed download
    }
    try {
      title = html.match(RE_TITLE)[1];
      body = decodeEntities(
        html.match(RE_CONTENT)[1].match(RE_PARAGRAPH).join("").replace(RE_STRIP, paragraphBreak)
      );
    } catch (parseErr) {
      showError(parseErr, "url=" + url);
      return;
    }
    try {
      out.WriteLine("A" + title + body.replace(RE_BLANK_LINES, ""));
    } catch (writeErr) {
      showError(writeErr, "Writing to file " + outName + " failed.");
    }
  }

  /** Saves every article linked from the article list of one category page. */
  function crawlCategory(url) {
    var listMatch = fetchText(url).match(RE_ARTICLE_LIST);
    var listHtml = listMatch ? listMatch[0] : "";
    var link;
    RE_ARTICLE_LINK.lastIndex = 0; // shared /g regex: always start from the top
    while ((link = RE_ARTICLE_LINK.exec(listHtml)) !== null) {
      saveArticle(BASE_URL + link[1]);
    }
  }

  // Write the output files next to the script itself.
  new ActiveXObject("WScript.Shell").CurrentDirectory = fso.GetParentFolderName(WScript.ScriptFullName);
  stream.Mode = 3; // adModeReadWrite

  for (var k = 0; k < categories.length; ++k) {
    var category = categories[k];
    if (!CATEGORY_PATHS[category]) {
      showError("Cannot find " + category);
      continue;
    }
    outName = category + ".txt";
    try {
      out = fso.OpenTextFile(outName, 2, true); // 2 = ForWriting, create if missing
    } catch (openErr) {
      showError(openErr, "Opening file " + outName);
      continue;
    }
    try {
      crawlCategory(BASE_URL + CATEGORY_PATHS[category]);
    } catch (crawlErr) {
      // A failed request aborts this category only; continue with the next one.
      showError(crawlErr);
    } finally {
      out.Close();
    }
  }
  WScript.Echo("Mission complete.");
})("唯美的句子","伤感的句子","幸福的句子","爱情的句子","表白的句子","励志的句子","正能量的句子","经典句子","好词好句","励志名言","名人名言","名言警句","读书名言","爱国名言","英语名言","经典名言","经典语录","名人语录","励志语录","正能量语录","情感语录","心情语录","爱情语录","伤感语录","搞笑语录","人生格言","励志格言","爱情格言","经典格言","英语格言","节日诗句","写景诗句","抒情诗句","爱国诗句","说说心情短语","伤感说说","空间说说","经典说说","搞笑说说","个性说说","个性签名","QQ个性签名","伤感个性签名","搞笑个性签名","励志个性签名","女生个性签名","幸福个性签名","情侣个性签名","超拽个性签名","春节短信","情人节短信","元宵节短信","生日短信","结婚短信","妇女节短信","愚人节短信","劳动节短信","母亲节短信","父亲节短信","端午节短信","七夕节短信","中秋节短信","感恩节短信","圣诞节短信","儿童节短信","经典短信","国庆节短信","教师节短信");
复制代码
作者: flashercs 时间: 2018-9-29 19:34
本帖最后由 flashercs 于 2018-9-30 00:44 编辑
// http://www.ceasm.com/
/**
 * Windows Script Host (JScript) scraper for http://www.ceasm.com/.
 *
 * For every top-level category name passed to the wrapper function, the link
 * of that category is looked up in the home page's <div class="menu">, the
 * sub-category links are taken from the category page's "keywords" block,
 * every paginated article list is walked, and each (possibly multi-page)
 * article is written to "<category>.txt" in the script's folder as
 * "A" + title, a line break, then the article text.
 *
 * Notes on this revision:
 *  - The keywords regex had a typo in its quote character class
 *    (`['"'[^>]*>`), which only matched when the class attribute was the last
 *    attribute of the tag.
 *  - The recovered listing had its HTML entity names already decoded, turning
 *    the entity replacements into no-ops; the entity names are restored.
 *  - Catch parameters no longer reuse the names of functions that are still
 *    needed; classic JScript engines are documented to bind the catch
 *    identifier in the enclosing function scope.
 *  - Articles that cannot be parsed are now reported instead of being skipped
 *    silently, and the output file is closed even when a crawl aborts.
 */
(function () {
  var BASE_URL = "http://www.ceasm.com/";

  var RE_MENU = /<div[^>]+class="menu"[^>]*>[\S\s]*?<\/div>/i;
  var RE_KEYWORDS = /<div[^>]+class=['"]keywords['"][^>]*>[\S\s]+?<\/div>/i;
  var RE_LINK = /<a[^>]+href=["']([^"']+)["'][^>]*>([^<>]*)<\/a>/gi;
  var RE_ARTICLE_LIST = /<ul[^>]+class=["']dedelist["'][^>]*>[\S\s]+?<\/ul>/i;
  var RE_PAGER = /<div[^>]+class=["']dede_pages["'][^>]*>[\S\s]+?<\/div>/i;
  var RE_ARTICLE_LINK = /<h4>\s*<a[^>]+href=["']([^"']+)["'][^>]*>[^<>]*<\/a>\s*<\/h4>/gi;
  var RE_NEXT_PAGE = /<a[^>]+href=["']([^"']+)["'][^>]*>下一页<\/a>/i;
  var RE_TITLE = /<h2>([\S\s]+?)<\/h2>/i;
  var RE_TEXT = /<div[^>]+class=["']text["'][^>]*>\s*<table[^>]*>[\S\s]+?<\/table>/i;
  // Removes every tag plus all whitespace, ideographic spaces and &nbsp; entities.
  var RE_STRIP = /<[^>]*>|(?:\s|\u3000|&nbsp;)+/gi;
  var RE_BLANK_LINES = /\n\s*(?=\n)/g;

  var categories = arguments; // category names supplied at the call site below
  var xhr = createXhr();
  var stream = new ActiveXObject("ADODB.Stream");
  var fso = new ActiveXObject("Scripting.FileSystemObject");
  var out;     // text stream of the category file currently being written
  var outName; // its file name, used in error messages

  /** Returns the first XMLHTTP automation object that can be created; quits with exit code 1 if none can. */
  function createXhr() {
    var progIds = ["MSXML2.XMLHTTP.6.0", "MSXML2.XMLHTTP.3.0", "MSXML2.XMLHTTP", "Microsoft.XMLHTTP"];
    for (var idx = 0; idx < progIds.length; idx++) {
      try {
        return new ActiveXObject(progIds[idx]);
      } catch (unavailable) {
        // This ProgID is not registered on the machine; try the next one.
      }
    }
    showError("Can't build XMLHTTP automation object.");
    WScript.Quit(1);
  }

  /**
   * Reports a problem through WScript.Echo.
   * @param err    an Error object (formatted field by field) or any other value (echoed as is)
   * @param source optional description of where the error happened
   */
  function showError(err, source) {
    if (Object.prototype.toString.call(err) !== "[object Error]") {
      WScript.Echo(err);
      return;
    }
    WScript.Echo([
      err.name,
      "source: " + (typeof source === "undefined" ? "" : source),
      "number: " + (err.number >>> 0).toString(16),
      "equipment: " + (err.number >> 16 & 8191),
      "code: " + (err.number & 65535),
      "Information: " + err.message
    ].join("\n"));
  }

  /**
   * Downloads a URL synchronously and decodes the response body as GBK.
   * @returns the page text, or "" (after reporting) when the status is not 200
   */
  function fetchText(url) {
    xhr.open("GET", url, false);
    xhr.send();
    if (xhr.status !== 200) {
      showError('fetch URI "' + url + '" failed.\nstatus: ' + xhr.status);
      return "";
    }
    stream.Type = 1; // adTypeBinary: load the raw bytes
    stream.Open();
    try {
      stream.Write(xhr.responseBody);
      stream.Position = 0;
      stream.Type = 2; // adTypeText: re-read the same bytes as text
      stream.Charset = "gbk";
      return stream.ReadText();
    } finally {
      stream.Close(); // the shared stream must be closed before the next Open()
    }
  }

  /** Replacement callback for RE_STRIP: closing </div> and </p> become line breaks, everything else is removed. */
  function paragraphBreak(matched) {
    var tag = matched.toLowerCase();
    return tag === "</div>" || tag === "</p>" ? "\r\n" : "";
  }

  /** Replaces the typographic HTML entities used by the site with the characters they stand for. */
  function decodeEntities(text) {
    return text
      .replace(/&middot;/g, "·")
      .replace(/&mdash;/g, "—")
      .replace(/&ldquo;/g, "“")
      .replace(/&rdquo;/g, "”")
      .replace(/&hellip;/g, "……")
      .replace(/&lsquo;/g, "‘")
      .replace(/&rsquo;/g, "’");
  }

  /**
   * Downloads one article, following its "下一页" (next page) links, and
   * appends it to the current category file. If any page of the article has
   * no title/text block the whole article is skipped, as before, but the
   * problem is now reported.
   */
  function saveArticle(url) {
    var dir = url.replace(/[^\/]+$/, ""); // next-page links are resolved against the article's folder
    var html = fetchText(url);
    var body = "";
    var titleMatch, textMatch, pagerMatch, nextMatch, nextUrl;

    if (html === "") {
      return; // fetchText already reported the failed download
    }
    titleMatch = html.match(RE_TITLE);
    if (!titleMatch) {
      showError("No article title found. url=" + url);
      return;
    }
    for (;;) {
      textMatch = html.match(RE_TEXT);
      if (!textMatch) {
        showError("No article text found. url=" + url);
        return;
      }
      body += textMatch[0].replace(RE_STRIP, paragraphBreak);

      pagerMatch = html.match(RE_PAGER);
      nextMatch = pagerMatch ? pagerMatch[0].match(RE_NEXT_PAGE) : null;
      if (!nextMatch) {
        break; // last page of the article
      }
      nextUrl = dir + nextMatch[1];
      try {
        html = fetchText(nextUrl);
      } catch (fetchErr) {
        // Keep the pages collected so far when a follow-up page cannot be requested.
        showError(fetchErr, "url=" + nextUrl);
        break;
      }
      if (html === "") {
        return; // follow-up page answered with an error status (already reported)
      }
    }

    try {
      out.WriteLine(
        "A" + titleMatch[1] + "\r\n" +
        decodeEntities(body).replace(/文章转自.*$/, "").replace(RE_BLANK_LINES, "")
      );
    } catch (writeErr) {
      showError(writeErr, "Writing to file " + outName + " failed.");
    }
  }

  /** Walks every page of one article list and saves each article linked from it. */
  function crawlList(url) {
    var dir = url.replace(/[^\/]+$/, "");
    var html, listMatch, pagerMatch, link, nextMatch;
    for (;;) {
      html = fetchText(url);
      listMatch = html.match(RE_ARTICLE_LIST);
      pagerMatch = html.match(RE_PAGER);

      RE_ARTICLE_LINK.lastIndex = 0; // shared /g regex: always start from the top
      while ((link = RE_ARTICLE_LINK.exec(listMatch ? listMatch[0] : "")) !== null) {
        saveArticle(BASE_URL + link[1]);
      }

      nextMatch = pagerMatch ? pagerMatch[0].match(RE_NEXT_PAGE) : null;
      if (!nextMatch) {
        break;
      }
      url = dir + nextMatch[1];
    }
  }

  // Write the output files next to the script itself.
  new ActiveXObject("WScript.Shell").CurrentDirectory = fso.GetParentFolderName(WScript.ScriptFullName);
  stream.Mode = 3; // adModeReadWrite

  var menuMatch = fetchText(BASE_URL).match(RE_MENU);
  if (!menuMatch) {
    showError("No navMenu.");
    WScript.Quit(1);
  }
  var menuHtml = menuMatch[0];

  for (var k = 0; k < categories.length; ++k) {
    var category = categories[k];
    var navLink = menuHtml.match(new RegExp('<a[^>]+href="([^"]+)"[^>]*>' + category, "i"));
    if (!navLink) {
      showError("Cannot find " + category);
      continue;
    }
    outName = category + ".txt";
    try {
      out = fso.OpenTextFile(outName, 2, true); // 2 = ForWriting, create if missing
    } catch (openErr) {
      showError(openErr, "Opening file " + outName);
      continue;
    }
    try {
      // The menu href is requested as is; the sub-category hrefs are appended to BASE_URL.
      var keywordsMatch = RE_KEYWORDS.exec(fetchText(navLink[1]));
      var keywordsHtml = keywordsMatch ? keywordsMatch[0] : "";
      var subLink;
      RE_LINK.lastIndex = 0;
      while ((subLink = RE_LINK.exec(keywordsHtml)) !== null) {
        try {
          crawlList(BASE_URL + subLink[1]);
        } catch (crawlErr) {
          // A failed request aborts this sub-category only.
          showError(crawlErr);
        }
      }
    } finally {
      out.Close();
    }
  }
  WScript.Echo("Mission complete.");
})("励志名言","名人名言","励志文章","人生感悟","经典语录","经典句子","散文精选","情感日志");
复制代码
作者: WHY 时间: 2018-9-29 21:50
本帖最后由 WHY 于 2018-10-3 12:17 编辑
修改脚本中 for 循环变量 i 的取值范围(原脚本第5行):这里 i 设置为从 1 到 1000,最大可以取到 145000 多。
比如: var i=1001; i<=3000; i++
可以下载 1001.html 到 3000.html 页面句子。
- var fso = new ActiveXObject('Scripting.FileSystemObject');
// Driver: downloads http://www.1juzi.com/new/<i>.html for every i in the loop
// range below, extracts the <h1> title and the non-empty <p> paragraphs, and
// appends them through writeToFile() as "A" + title followed by one paragraph
// per line. Change the loop bounds of `i` to download a different page range.
var http = new ActiveXObject('Microsoft.XMLHTTP');
var home = 'http://www.1juzi.com/';

// Page title; pages whose <h1> starts with "服务器错误" (server error) are skipped.
var titleRe = /<h1>((?!服务器错误).*?)<\/h1>/;
// Every <p>...</p> that is not empty.
var paragraphRe = /<p>(?!<\/p).*<\/p>/ig;
// HTML entities to turn back into characters. The recovered listing showed the
// already-decoded characters in these patterns, which made the replacements
// no-ops; the entity names are restored here.
var entities = [
  [/&lsquo;/g, '‘'], [/&rsquo;/g, '’'], [/&hellip;/g, '…'], [/&#0*39;|&apos;/g, "'"],
  [/&middot;/g, '·'], [/&ldquo;/g, '“'], [/&rdquo;/g, '”'],
  [/&mdash;/g, '—'], [/&quot;/g, '"']
];
var failed = []; // pages that could not be downloaded, reported once at the end

for (var i = 1; i <= 1000; i++) {
  var url = home + 'new/' + i + '.html';
  var txt;
  try {
    txt = getText(url);
  } catch (downloadErr) {
    // Some pages cannot be opened; remember them and carry on with the next page.
    failed.push(url + ' (' + (downloadErr.description || downloadErr.message) + ')');
    continue;
  }
  var m = txt.match(titleRe); // match the page title
  if (!m) continue;
  var stc = [];
  var arr;
  paragraphRe.lastIndex = 0;
  while ((arr = paragraphRe.exec(txt)) !== null) {
    var s = arr[0].replace(/<[^>]*>|&nbsp;/g, '');
    s = s.replace(/句子大全http:\/\/www\.1juzi\.com\//ig, '');
    for (var k = 0; k < entities.length; k++) {
      s = s.replace(entities[k][0], entities[k][1]);
    }
    stc.push(s);
  }
  writeToFile('A' + m[1] + '\r\n' + stc.join('\r\n'));
}

if (failed.length > 0) {
  WScript.Echo('Failed to download:\r\n' + failed.join('\r\n'));
}
-
/**
 * Downloads `url` synchronously through the script-level `http` object and
 * returns the response body decoded as GB2312 text.
 *
 * The bytes are pushed through an ADODB.Stream (binary in, text out) because
 * the page encoding is not the one responseText would assume.
 *
 * @param url absolute URL to request
 * @returns the decoded page text
 * Errors raised by the request or by the stream propagate to the caller; the
 * stream is closed in either case.
 */
function getText(url) {
  http.open('GET', url, false);
  http.send();

  var stream = new ActiveXObject('ADODB.Stream');
  try {
    stream.Mode = 3;  // adModeReadWrite
    stream.Type = 1;  // adTypeBinary
    stream.Open();
    stream.Write(http.responseBody);
    stream.Position = 0;
    stream.Type = 2;  // adTypeText
    stream.Charset = 'GB2312';
    return stream.ReadText(-1); // -1 = adReadAll
  } finally {
    // State 0 is adStateClosed; Close() on a stream that never opened would raise.
    if (stream.State !== 0) {
      stream.Close();
    }
  }
}
-
/**
 * Appends `str` as one line to "1juzi.Log" in the current directory,
 * creating the file when it does not exist yet.
 *
 * The file is opened and closed on every call; the handle is released even
 * when WriteLine raises, and the error then propagates to the caller.
 *
 * @param str text to append
 */
function writeToFile(str) {
  var objFile = fso.OpenTextFile('1juzi.Log', 8, true); // 8 = ForAppending
  try {
    objFile.WriteLine(str);
  } finally {
    objFile.Close();
  }
}
-
// Tell the user that the run has finished.
WScript.Echo('Done');
复制代码
作者: WHY 时间: 2018-9-29 21:59
本帖最后由 WHY 于 2018-9-30 19:19 编辑
// Driver: for every category in `map`, walks the paginated article list of
// http://www.ceasm.com/<path>/ and appends each article through writeToFile()
// as "A" + title, a line break, then the text returned by getSentence().
var fso = new ActiveXObject('Scripting.FileSystemObject');
var http = new ActiveXObject('Microsoft.XMLHTTP');

// Full category list; swap it in for `map` below to download every category.
//var map = {'励志名言':'lizhimingyan','名人名言':'mingrenmingyan','励志文章':'lizhiwenzhang','人生感悟':'renshengganwu','经典语录':'jingdianyulu','经典句子':'jingdianjuzi','散文精选':'sanwenjingxuan','情感日志':'qingganrizhi'};
var map = {'散文精选':'sanwenjingxuan'};
var home = 'http://www.ceasm.com/';

// One article entry of the list: capture 1 = path without leading slash, capture 2 = title.
var reg = /<h4><a href="\/([^"]*\.html)" target="_blank">([^<>]*)<\/a><\/h4>/g;
var nextPageRe = /<a href='([^']*\.html)'>下一页/;
var failed = []; // URLs that could not be processed, reported once at the end
var arr;

// `key` stays a script-level variable on purpose: writeToFile() reads it to pick the output file.
for (var key in map) {
  var url = home + map[key] + '/';
  while (url !== '') {
    var txt;
    try {
      txt = getText(url);
    } catch (listErr) {
      // Without this list page the next one cannot be located; give up on the category.
      failed.push(url + ' (' + (listErr.description || listErr.message) + ')');
      break;
    }
    var m = txt.match(nextPageRe);
    // Resolve the next-page link against the folder of the current page, the
    // same way getSentence() does (the link is joined without adding a slash).
    url = m ? url.replace(/[^\/]*$/, '') + m[1] : '';
    txt = txt.split('<ul class="dedelist">')[1];
    if (!txt) continue;
    txt = txt.split('</ul>')[0];
    reg.lastIndex = 0;
    while ((arr = reg.exec(txt)) !== null) {
      try {
        writeToFile('A' + arr[2] + '\r\n' + getSentence(home + arr[1]));
      } catch (articleErr) {
        // Some articles cannot be opened or written; note them and continue.
        failed.push(home + arr[1] + ' (' + (articleErr.description || articleErr.message) + ')');
      }
    }
  }
}

if (failed.length > 0) {
  WScript.Echo('Failed:\r\n' + failed.join('\r\n'));
}
-
/**
 * Downloads `url` synchronously through the script-level `http` object and
 * returns the response body decoded as GB2312 text.
 *
 * The bytes are pushed through an ADODB.Stream (binary in, text out) because
 * the page encoding is not the one responseText would assume.
 *
 * @param url absolute URL to request
 * @returns the decoded page text
 * Errors raised by the request or by the stream propagate to the caller; the
 * stream is closed in either case.
 */
function getText(url) {
  http.open('GET', url, false);
  http.send();

  var stream = new ActiveXObject('ADODB.Stream');
  try {
    stream.Mode = 3;  // adModeReadWrite
    stream.Type = 1;  // adTypeBinary
    stream.Open();
    stream.Write(http.responseBody);
    stream.Position = 0;
    stream.Type = 2;  // adTypeText
    stream.Charset = 'GB2312';
    return stream.ReadText(-1); // -1 = adReadAll
  } finally {
    // State 0 is adStateClosed; Close() on a stream that never opened would raise.
    if (stream.State !== 0) {
      stream.Close();
    }
  }
}
-
/**
 * Downloads an article, following its "下一页" (next page) links, and returns
 * the article text with one paragraph per line.
 *
 * For each page the part between '<div class="text">' and the first
 * '</table>' is taken, closing </p> tags become line breaks, all other tags
 * and &nbsp; entities are removed, leading spaces and empty lines are dropped,
 * and the typographic HTML entities are turned back into characters. Pages
 * without a text block contribute nothing.
 *
 * The recovered listing showed the already-decoded characters in the entity
 * patterns (making them no-ops) and a plain space where "&nbsp;" belongs
 * (deleting every space); the entity names are restored here.
 *
 * @param url absolute URL of the article's first page
 * @returns the text of all pages joined with CRLF ("" when nothing was found)
 */
function getSentence(url) {
  var nextPageRe = /<a href='([^']*\.html)'>下一页/;
  var entities = [
    [/&#0*39;|&apos;/g, "'"],
    [/&lsquo;/g, '‘'], [/&rsquo;/g, '’'], [/&hellip;/g, '…'],
    [/&middot;/g, '·'], [/&ldquo;/g, '“'], [/&rdquo;/g, '”'],
    [/&mdash;/g, '—'], [/&quot;/g, '"']
  ];
  var pages = [];
  var html, next, text, k;

  while (url !== '') {
    html = getText(url);
    // Work out the following page first so that `continue` below still advances.
    next = html.match(nextPageRe);
    url = next ? url.replace(/[^\/]*$/, '') + next[1] : '';

    text = html.split('<div class="text">')[1];
    if (!text) continue;
    text = text.split('</table>')[0];
    text = text.replace(/(?:<\/strong>)?<\/p>/g, '\r\n');
    text = text.replace(/<[^>]*>|&nbsp;/g, '');
    // Leading indentation: ASCII spaces and (assumed, as is usual on Chinese pages) ideographic spaces.
    text = text.replace(/^[ \u3000]+/gm, '');
    // Collapse runs of line breaks, then trim one line break from each end.
    text = text.replace(/(\r?\n)+/g, '$1').replace(/^(?:\r?\n)|(?:\r?\n)$/g, '');
    for (k = 0; k < entities.length; k++) {
      text = text.replace(entities[k][0], entities[k][1]);
    }
    pages.push(text);
  }
  return pages.join('\r\n');
}
-
/**
 * Appends `str` as one line to "<key>.Log" in the current directory, where
 * `key` is the script-level category name set by the driver loop. The file is
 * created when it does not exist yet.
 *
 * The file is opened and closed on every call; the handle is released even
 * when WriteLine raises, and the error then propagates to the caller.
 *
 * @param str text to append
 */
function writeToFile(str) {
  var objFile = fso.OpenTextFile(key + '.Log', 8, true); // 8 = ForAppending
  try {
    objFile.WriteLine(str);
  } finally {
    objFile.Close();
  }
}
-
// Tell the user that the run has finished.
WScript.Echo('Done');
复制代码
作者: lxh623 时间: 2018-9-30 16:49
回复 6# WHY
麻烦帮我看一看。经典语录这里退出了。谢谢!
祝大家国庆节快乐!!
作者: WHY 时间: 2018-9-30 19:24
回复 7# lxh623
已修改。
有些网址打不开,继续即可。
作者: lxh623 时间: 2018-10-1 10:58
回复 8# WHY
谢谢!使用中有个想法,以后,是不是尽可能用网页字符代码。
作者: lxh623 时间: 2018-10-1 11:12
回复 5# WHY
求保持网页代码的办法。因为有乱码。
作者: 523066680 时间: 2018-10-1 13:29
在扒第一个网站的时候发现了一段不太一样的
http://www.ceasm.com/lizhiyulu/2814.html- 克日,萌宠举措悬疑笑剧影戏《营救汪星人》宣布了全新的“励志汪”版创意主题写真,写真图中汪汪们或在山顶瞻仰天空,或趴在路边思索人生,再配以切合意境的“励志心灵鸡汤”每只狗狗都有和本身相对应的励志语录。如:“不是每次主人城市给肉骨头,但每一次都值得本汪去实行”,站长之家 站长站 中国站长 站长网 seo查询 帮站seo 权重查询 网站权重查询 站长工具综合查询 百度排名查询 排名查询 iis7网站监控, 网站监控 免费网站监控 监控宝 域名劫持 域名被墙 dns污染 网站速度检测 网站速度测试 网站被黑 ,这种话也只有二哈能想出来;“若是本日糊口诱骗了你,不要哀痛,不要抽泣,你还可以诱骗你本身”等风趣应景的话语,让人捧腹大笑的同时不禁直呼列位看官快饮了这碗“狗狗心灵鸡汤”。
复制代码
作者: WHY 时间: 2018-10-1 14:07
回复 10# lxh623
我不明白“保持网页代码”是个什么鬼,
我这里测试没有“乱码”,请举例说明你的要求。
另外,5楼脚本中第15、16、17行是替换 html 转义字符的。如果“乱码”指这些转义字符,删掉15、16、17行。
作者: lxh623 时间: 2018-10-1 14:37
回复 12# WHY
http://www.1juzi.com/new/6228.html
还有签名大全里面有很多自创字符,EmEditor打开编辑后有乱码。
另一个兄弟是保存为同一编码的文本。
所以,我觉得,编码可能一样更好。UTF-8和GB,有些字符会变化。
谢谢!国庆节快乐!
作者: WHY 时间: 2018-10-1 15:42
回复 13# lxh623
这个网址我下载后用记事本打开没发现有乱码,Win10 v1803 系统。
作者: lxh623 时间: 2018-10-3 10:02
有些标题没有抓到,请再帮我看看。
比如,http://www.1juzi.com/new/122019.html
我试着抓了一千页,没有得到该页标题。
谢谢!
作者: WHY 时间: 2018-10-3 12:18
回复 15# lxh623
已修改。
作者: xczxczxcz 时间: 2018-10-3 14:53
30? 路边摆碗,日收入有时也好几百哦。
戏子台上摆个型。轻松录个广告。就是几十万,几百万个30啊。技术误国。
作者: lxh623 时间: 2018-10-3 16:34
本帖最后由 lxh623 于 2018-10-3 16:36 编辑
再来一个网站。
http://www.lz13.cn/
二级链接就是上面两行栏目的链接。以及最下面一行的“资料”及倒数第二行作文大全 诗词名句 读后感 观后感 读书笔记 好词好句 祝福语 经典台词 个性签名 教育教学 日志大全 等等。
暂时不知道网页还有更多的链接不。
要求与上面一样。标题加个A 。
谢谢!!
作者: lxh623 时间: 2018-10-4 07:55
回复 16# WHY
第25(或者15)行第五个字符。退出了。
大约http://www.1juzi.com/new/4099.html后面。
谢谢!
作者: WHY 时间: 2018-10-4 12:06
回复 19# lxh623
我这里没有问题。
第15行第5个字符是空白字符,不可能报错。第25行 send 报错,可能是网络或服务器自身的问题,你换个时间点再试。
作者: flashercs 时间: 2018-10-6 07:30
本帖最后由 flashercs 于 2018-10-6 10:07 编辑
/**
 * Windows Script Host (JScript) scraper for https://www.lz13.cn/.
 *
 * The site's sitemap index is downloaded, every listed sitemap (except the
 * last entry, see the loop below) is read, and every page URL found there is
 * fetched. Pages that contain #content > .Post > .PostHead/.PostContent are
 * appended to "lizhi13.txt" (Unicode) as "A" + title, a line break, then the
 * text of the post. Finished page URLs and finished sitemaps are recorded in
 * "$cache.log" so that a later run skips them; problems go to the log file.
 *
 * Notes on this revision:
 *  - The recovered listing was hard-wrapped inside a string literal
 *    ("The Content is not a XML.URL="), which is a syntax error; the string
 *    is whole again.
 *  - getHTML() now returns the HTTP status (0 when the request itself failed)
 *    and logs request errors, so callers no longer read xhr.status after a
 *    failed send().
 *  - Errors raised while processing a page are logged instead of discarded.
 *  - A post without an <h2> is written with an empty title; the old test
 *    `s ? s[0].innerText : ""` checked the collection object, which is always
 *    truthy, and so raised instead of falling back.
 */

/**
 * Writes a problem description to the log file.
 * @param err    an Error object (formatted field by field) or any other value (logged as is)
 * @param source optional description of where the error happened
 */
function showError(err, source) {
  if (Object.prototype.toString.call(err) !== "[object Error]") {
    tsLog.WriteLine(err);
    return;
  }
  tsLog.WriteLine([
    err.name,
    "source: " + (typeof source === "undefined" ? "" : source),
    "number: " + (err.number >>> 0).toString(16),
    "equipment: " + (err.number >> 16 & 8191),
    "code: " + (err.number & 65535),
    "Information: " + err.message
  ].join("\r\n"));
}

/**
 * Requests `url` synchronously with browser-like headers; the response stays
 * available on the shared `xhr` object.
 * @returns the HTTP status, or 0 when the request could not be completed
 */
function getHTML(url) {
  xhr.open("GET", url, false);
  xhr.setRequestHeader("Accept", "text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8");
  xhr.setRequestHeader("Accept-Language", "en-US, en; q=0.8, zh-Hans-CN; q=0.5, zh-Hans; q=0.3");
  xhr.setRequestHeader("Accept-Encoding", "gzip, deflate");
  xhr.setRequestHeader("TE", "gzip, deflate");
  xhr.setRequestHeader("Connection", "close");
  xhr.setRequestHeader("Cache-Control", "no-cache");
  xhr.setRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134");
  xhr.setRequestHeader("Upgrade-Insecure-Requests", "1");
  try {
    xhr.send();
    return xhr.status;
  } catch (sendErr) {
    showError(sendErr, url);
    return 0;
  }
}

/** Builds the log text for an unexpected HTTP status, including the response headers when there are any. */
function describeResponse(status) {
  var headers = "";
  try {
    headers = xhr.getAllResponseHeaders();
  } catch (noResponse) {
    // No response was received, so there are no headers to show.
  }
  return "status" + status + "\r\n" + headers;
}

/**
 * Checks the parse result of an XML document.
 * @returns true when the document parsed cleanly; otherwise logs the parse error and returns false
 */
function parseXML(doc) {
  var problem = doc.parseError;
  if (problem.errorCode === 0) {
    return true;
  }
  showError([
    "You have error ",
    "reason: " + problem.reason,
    "code: " + (problem.errorCode >>> 0).toString(16),
    "filepos: " + problem.filepos,
    "line: " + problem.line,
    "linepos: " + problem.linepos,
    "srcText: " + problem.srcText,
    "url: " + problem.url
  ].join("\n"));
  return false;
}

/**
 * Returns the first child of `parent` whose className equals `className`
 * (optionally only element nodes), or null when there is none.
 */
function findChildByClass(parent, className, elementsOnly) {
  var node = parent.firstChild;
  while (node && ((elementsOnly && node.nodeType !== 1) || node.className !== className)) {
    node = node.nextSibling;
  }
  return node;
}

/**
 * Downloads one page and, when it has the expected post structure, appends
 * the post to the output file and records the URL in the cache file.
 * Pages that do not answer with 200 or lack the structure are skipped.
 */
function getContent(url) {
  var content, post, head, headings, title, body, children, idx;

  if (getHTML(url) !== 200) {
    return;
  }
  // Parse the markup with the "htmlfile" document so the DOM can be walked.
  document.open();
  document.write(xhr.responseText);
  document.close();

  content = document.getElementById("content");
  if (!content) {
    return;
  }
  post = findChildByClass(content, "Post", true);
  if (!post) {
    return;
  }
  head = findChildByClass(post, "PostHead", false);
  if (!head) {
    return;
  }
  headings = head.getElementsByTagName("h2");
  title = headings && headings.length > 0 ? headings[0].innerText : "";

  body = findChildByClass(post, "PostContent", false);
  if (!body) {
    return;
  }
  // Drop children matched by re1 (class name) or re2 (node name) before taking
  // the text; iterate backwards because the list shrinks while removing.
  children = body.childNodes;
  for (idx = children.length - 1; idx >= 0; --idx) {
    if (re1.test(children[idx].className) || re2.test(children[idx].nodeName)) {
      body.removeChild(children[idx]);
    }
  }
  try {
    ts.WriteLine("A" + (title + "\r\n" + body.innerText).replace(re3, "\r\n").replace(re4, ""));
    tsCache.WriteLine(url);
  } catch (writeErr) {
    showError(writeErr, url);
  }
}

var xhr = (function () {
  var progIds = ["MSXML2.XMLHTTP.6.0", "MSXML2.XMLHTTP.3.0", "MSXML2.XMLHTTP", "Microsoft.XMLHTTP"];
  for (var idx = 0; idx < progIds.length; idx++) {
    try {
      return WScript.CreateObject(progIds[idx]);
    } catch (unavailable) {
      // This ProgID is not registered on the machine; try the next one.
    }
  }
  WScript.Quit(1);
})();
var xmlDoc;
var xmlDoc2;
var sitemap = "https://www.lz13.cn/sitemap.xml";
var sitemapValue;
var document = new ActiveXObject("htmlfile");
var fso = new ActiveXObject("Scripting.FileSystemObject");
var ts;       // output file
var tsLog;    // log file
var tsCache;  // cache of finished URLs
var file = "lizhi13.txt";
var oSelection;
var oSelection2;
var nodeLoc;
var re1 = /left_box\d*|blank.*|pager/;
var re2 = /ul|ol/i;
var re3 = /[\r\n][\s\u3000]+/g;      // a line break plus the blank space that follows it
var re4 = /^[\s\u3000]+|\s+$/g;      // blank space at either end of the text
var logfile = encodeURIComponent(sitemap) + ".log";
var cachefile = "$cache.log";
var oCache = {};
var httpStatus;

try {
  tsLog = fso.OpenTextFile(logfile, 8, true); // 8 = ForAppending
} catch (openLogErr) {
  WScript.Echo("can not write log file " + logfile);
  WScript.Quit(2);
}
try {
  tsCache = fso.OpenTextFile(cachefile, 1, true); // 1 = ForReading
} catch (openCacheErr) {
  WScript.Echo("can not read cache file " + cachefile);
  WScript.Quit(2);
}
// Load the URLs finished by earlier runs.
while (!tsCache.AtEndOfStream) {
  oCache[tsCache.ReadLine()] = 1;
}
tsCache.Close();
try {
  tsCache = fso.OpenTextFile(cachefile, 8, true);
} catch (appendCacheErr) {
  WScript.Echo("can not write cache file " + cachefile);
  WScript.Quit(2);
}
try {
  ts = fso.OpenTextFile(file, 8, true, -1); // -1 = TristateTrue (Unicode)
} catch (openOutErr) {
  showError(openOutErr, "can not write file " + file);
  WScript.Quit(3);
}

try {
  httpStatus = getHTML(sitemap);
  if (httpStatus === 200) {
    xmlDoc = xhr.responseXML;
    if (xmlDoc == null) {
      showError("The content is not a XML.URL=" + sitemap);
      WScript.Quit(4);
    }
    if (!parseXML(xmlDoc)) {
      WScript.Quit(5);
    }
    xmlDoc.setProperty("SelectionLanguage", "XPath");
    xmlDoc.setProperty("SelectionNamespaces", "");
    oSelection = xmlDoc.selectNodes("sitemapindex/sitemap/loc/text()");

    // NOTE(review): the bound deliberately kept from the original skips the
    // last sitemap of the index; confirm whether that entry should be read too.
    for (var i = 0, l = oSelection.length - 1; i < l; ++i) {
      sitemapValue = oSelection.item(i).nodeValue;
      if (oCache[sitemapValue]) {
        continue;
      }
      httpStatus = getHTML(sitemapValue);
      if (httpStatus !== 200) {
        showError(describeResponse(httpStatus));
        continue;
      }
      xmlDoc2 = xhr.responseXML;
      if (xmlDoc2 == null) {
        showError("The Content is not a XML.URL=" + sitemapValue);
        continue;
      }
      if (!parseXML(xmlDoc2)) {
        continue;
      }
      xmlDoc2.setProperty("SelectionLanguage", "XPath");
      // Bind prefix "t" to the document's default namespace so XPath can address its elements.
      xmlDoc2.setProperty("SelectionNamespaces", 'xmlns:t="' + xmlDoc2.documentElement.getAttribute("xmlns") + '"');
      oSelection2 = xmlDoc2.selectNodes("t:urlset/t:url/t:loc/text()");

      while ((nodeLoc = oSelection2.nextNode())) {
        if (oCache[nodeLoc.nodeValue]) {
          continue;
        }
        try {
          getContent(nodeLoc.nodeValue);
        } catch (pageErr) {
          // One broken page must not stop the run, but leave a trace in the log.
          showError(pageErr, nodeLoc.nodeValue);
        }
      }
      tsCache.WriteLine(sitemapValue); // the whole sitemap has been walked
    }
  } else {
    showError(describeResponse(httpStatus));
  }
} catch (fatalErr) {
  showError(fatalErr);
  throw fatalErr;
} finally {
  ts.Close();
  tsLog.Close();
  tsCache.Close();
}
WScript.Echo("Mission complete.");
WScript.Quit(0);
复制代码
作者: 523066680 时间: 2018-10-6 08:09
本帖最后由 523066680 于 2018-10-6 08:11 编辑
回复 21# flashercs
我还以为题主在后台和你们联系付款,你们才能孜孜不倦地跟进。。。
我试着扒了一个,感觉30元根本不划算,除非是自己做着玩儿 。
http://bbs.bathome.net/thread-50855-1-1.html
另外,17楼说的很对。
作者: lxh623 时间: 2018-10-6 08:13
回复 21# flashercs
站内短信,您没有回复。另一个您可以问一问。
这个脚本下载了1860k,就停了。
昨天,我自己把网页下载了,也是个办法。我也是觉得做成字典,可以自己看看,别人写作也可以参考,不是赚钱的。花一点点钱,玩一玩。当然,感谢大家帮助。
欢迎光临 批处理之家 (http://bbs.bathome.net/) |
Powered by Discuz! 7.2 |