[视频教程]批处理基础视频教程[视频教程]VBS基础视频教程批处理在线视频分享
返回列表 发帖

【已解决】抓取两个网站的文章

本帖最后由 lxh623 于 2018-10-7 08:06 编辑

http://www.ceasm.com/
http://www.1juzi.com/juzidaquan/


两个都有八个一级栏目。二级栏目,第一个在下面“栏目导航”那里,第二个就在下面粉色的文字。
希望进入每个二级栏目抓取文章(有些文章有多页)。文章标题前加字母 A,正文紧跟在标题后面,每一段落为文本的一行。
比如,第一个网站第一个二级栏目的第一篇:(行首空格,我可以删除的。)

A不到不可怕,守不住才是个笑话
经典语录:不到不可怕,守不住才是个笑话
1、我看我自己看了20多年才看顺眼,你看我不顺眼很正常。我活着也不是为了取悦你。
2、我要的,只是简单而安稳的生活,最好的幸福,是你给的在乎。

一个网站 30元,谢谢大家!!

联系:632858742

TOP

本帖最后由 flashercs 于 2018-9-30 00:41 编辑

http://www.1juzi.com/
  /**
   * Windows Script Host (JScript) scraper for http://www.1juzi.com/.
   *
   * Each argument passed to the wrapper function is a category name that must
   * be a key of CATEGORY_PATHS. For every category the script opens
   * "<category>.txt" next to the script file (overwriting it), downloads the
   * category page, follows every link inside its <ul class="alist"> list and
   * writes one record per article: the letter "A", the <h1> title, then the
   * article paragraphs.
   *
   * Written for classic JScript: only ES3 syntax, no trailing commas.
   */
  (function () {
      var SITE_ROOT = "http://www.1juzi.com/";

      var LIST_RE = /<ul[^>]+class=["']alist["'][^>]*>[\S\s]*?<\/ul>/i;
      var LINK_RE = /<a[^>]+href=["']([^"']+)["'][^>]*>[^<>]*<\/a>/gi;
      var TITLE_RE = /<h1>([\S\s]+?)<\/h1>/i;
      var CONTENT_RE = /<div[^>]+class=["']content["'][^>]*>\s*<div[^>]+class="news"[^>]*>[\S\s]*?<\/div>([\S\s]+?)<\/div>/i;
      var PARAGRAPH_RE = /<p>[\S\s]*?<\/p>/gi;
      var MARKUP_RE = /<strong>[\S\s]*?<\/strong>|<[^>]*>|(?:&nbsp;|\n|\r)+/gi;
      var BLANK_LINE_RE = /\n\s*(?=\n)/g;

      // HTML entities that are turned back into the characters they stand for.
      var ENTITY_TABLE = [
          [/&middot;/g, "·"],
          [/&mdash;/g, "—"],
          [/&ldquo;/g, "“"],
          [/&rdquo;/g, "”"],
          [/&hellip;/g, "……"],
          [/&lsquo;/g, "‘"],
          [/&rsquo;/g, "’"]
      ];

      // Category name -> site path of the category page.
      var CATEGORY_PATHS = {
          "唯美的句子": "/weimeidejuzi/",
          "伤感的句子": "/shanggandejuzi/",
          "幸福的句子": "/xingfudejuzi/",
          "爱情的句子": "/aiqingdejuzi/",
          "表白的句子": "/biaobaidejuzi/",
          "励志的句子": "/lizhidejuzi/",
          "正能量的句子": "/zhengnengliangdejuzi/",
          "经典句子": "/jingdianjuzi/",
          "好词好句": "/haocihaoju/",
          "励志名言": "/lizhimingyan/",
          "名人名言": "/mingrenmingyan/",
          "名言警句": "/mingyanjingju/",
          "读书名言": "/dushumingyan/",
          "爱国名言": "/aiguomingyan/",
          "英语名言": "/yingyumingyan/",
          "经典名言": "/jingdianmingyan/",
          "经典语录": "/jingdianyulu/",
          "名人语录": "/mingrenyulu/",
          "励志语录": "/lizhiyulu/",
          "正能量语录": "/zhengnengliangyulu/",
          "情感语录": "/qingganyulu/",
          "心情语录": "/xinqingyulu/",
          "爱情语录": "/aiqingyulu/",
          "伤感语录": "/shangganyulu/",
          "搞笑语录": "/gaoxiaoyulu/",
          "人生格言": "/renshenggeyan/",
          "励志格言": "/lizhigeyan/",
          "爱情格言": "/aiqinggeyan/",
          "经典格言": "/jingdiangeyan/",
          "英语格言": "/yingyugeyan/",
          "节日诗句": "/jierishiju/",
          "写景诗句": "/xiejingshiju/",
          "抒情诗句": "/shuqingshiju/",
          "爱国诗句": "/aiguoshiju/",
          "说说心情短语": "/shuoshuoxinqingduanyu/",
          "伤感说说": "/shangganshuoshuo/",
          "空间说说": "/kongjianshuoshuo/",
          "经典说说": "/jingdianshuoshuo/",
          "搞笑说说": "/gaoxiaoshuoshuo/",
          "个性说说": "/gexingshuoshuo/",
          "个性签名": "/gexingqianming/",
          "QQ个性签名": "/qqgexingqianming/",
          "伤感个性签名": "/shanggangexingqianming/",
          "搞笑个性签名": "/gaoxiaogexingqianming/",
          "励志个性签名": "/lizhigexingqianming/",
          "女生个性签名": "/nvshenggexingqianming/",
          "幸福个性签名": "/xingfugexingqianming/",
          "情侣个性签名": "/qinglvgexingqianming/",
          "超拽个性签名": "/chaozhuaigexingqianming/",
          "春节短信": "/chunjieduanxin/",
          "情人节短信": "/qingrenjieduanxin/",
          "元宵节短信": "/yuanxiaojieduanxin/",
          "生日短信": "/shengriduanxin/",
          "结婚短信": "/jiehunduanxin/",
          "妇女节短信": "/funvjieduanxin/",
          "愚人节短信": "/yurenjieduanxin/",
          "劳动节短信": "/laodongjieduanxin/",
          "母亲节短信": "/muqinjieduanxin/",
          "父亲节短信": "/fuqinjieduanxin/",
          "端午节短信": "/duanwujieduanxin/",
          "七夕节短信": "/qixijieduanxin/",
          "中秋节短信": "/zhongqiujieduanxin/",
          "感恩节短信": "/ganenjieduanxin/",
          "圣诞节短信": "/shengdanjieduanxin/",
          "儿童节短信": "/ertongjieduanxin/",
          "经典短信": "/jingdianduanxin/",
          "国庆节短信": "/guoqingjieduanxin/",
          "教师节短信": "/jiaoshijieduanxin/"
      };

      /**
       * Show an error to the user through WScript.Echo.
       * Error objects are expanded into name, source, and the number split into
       * its hexadecimal form, bits 16-28 ("equipment") and low 16 bits ("code");
       * any other value is echoed as-is.
       *
       * @param {*} err Error object or plain message.
       * @param {string} [source] Extra context shown on the "source:" line.
       */
      function reportError(err, source) {
          if (Object.prototype.toString.call(err) !== "[object Error]") {
              WScript.Echo(err);
              return;
          }
          WScript.Echo([
              err.name,
              "source: " + (typeof source === "undefined" ? "" : source),
              "number: " + (err.number >>> 0).toString(16),
              "equipment: " + (err.number >> 16 & 8191),
              "code: " + (err.number & 65535),
              "Information: " + err.message
          ].join("\n"));
      }

      /**
       * Create the first XMLHTTP automation object that is available, trying
       * the newest ProgID first. Quits the script with exit code 1 if none works.
       *
       * NOTE(review): catch identifiers in this script deliberately never reuse
       * the name of a function or variable of the enclosing scope; classic
       * JScript is reported to bind the catch identifier in that scope, which
       * would overwrite the outer binding - confirm against the JScript docs.
       */
      function createHttpClient() {
          var progIds = ["MSXML2.XMLHTTP.6.0", "MSXML2.XMLHTTP.3.0", "MSXML2.XMLHTTP", "Microsoft.XMLHTTP"];
          for (var k = 0; k < progIds.length; k++) {
              try {
                  return new ActiveXObject(progIds[k]);
              } catch (creationErr) {
                  // This ProgID could not be created; try the next one.
              }
          }
          reportError("Can't build XMLHTTP automation object.");
          WScript.Quit(1);
      }

      /**
       * Turn a link taken from a page into a full URL.
       * Absolute http(s) links are returned unchanged; anything else is appended
       * to SITE_ROOT without producing a doubled slash.
       */
      function resolveUrl(href) {
          if (/^https?:\/\//i.test(href)) {
              return href;
          }
          return SITE_ROOT + href.replace(/^\/+/, "");
      }

      /**
       * Download a URL synchronously and decode the response body as "gbk".
       *
       * @param {string} url Address to fetch.
       * @returns {string} Decoded page text, or "" (after reporting) when the
       *     HTTP status is not 200.
       */
      function fetchText(url) {
          http.open("GET", url, false);
          http.send();
          if (http.status !== 200) {
              reportError('fetch URI "' + url + '" failed.\nstatus: ' + http.status);
              return "";
          }
          stream.Type = 1;            // binary, to accept the raw response bytes
          stream.Open();
          try {
              stream.Write(http.responseBody);
              stream.Position = 0;
              stream.Type = 2;        // text, so ReadText decodes with Charset
              stream.Charset = "gbk";
              return stream.ReadText();
          } finally {
              // The stream is shared by every download; always leave it closed
              // so the next call can reopen it even after a failure here.
              stream.Close();
          }
      }

      /** Replacement callback for MARKUP_RE: "</p>" ends a line, everything else is dropped. */
      function markupReplacement(token) {
          return token.toLowerCase() === "</p>" ? "\r\n" : "";
      }

      /** Apply every ENTITY_TABLE substitution to the given text. */
      function decodeEntities(text) {
          for (var k = 0; k < ENTITY_TABLE.length; k++) {
              text = text.replace(ENTITY_TABLE[k][0], ENTITY_TABLE[k][1]);
          }
          return text;
      }

      /**
       * Download one article and append it to the current output file.
       * An article that cannot be downloaded or parsed is reported (with its
       * URL) and skipped; a failed write is reported as well.
       */
      function scrapeArticle(url) {
          var title;
          var body;
          try {
              var html = fetchText(url);
              title = html.match(TITLE_RE)[1];
              body = decodeEntities(
                  html.match(CONTENT_RE)[1]
                      .match(PARAGRAPH_RE)
                      .join("")
                      .replace(MARKUP_RE, markupReplacement)
              );
          } catch (parseErr) {
              reportError(parseErr, "url=" + url);
              return;
          }
          try {
              outFile.WriteLine("A" + title + body.replace(BLANK_LINE_RE, ""));
          } catch (writeErr) {
              reportError(writeErr, "Writing to file " + outName + " failed.");
          }
      }

      /** Download a category page and scrape every article linked from its list. */
      function scrapeCategory(url) {
          var page = fetchText(url);
          var listMatch = page.match(LIST_RE);
          var listHtml = listMatch ? listMatch[0] : "";
          var link;
          // LINK_RE is global and shared; start every category from offset 0.
          LINK_RE.lastIndex = 0;
          while ((link = LINK_RE.exec(listHtml)) !== null) {
              scrapeArticle(resolveUrl(link[1]));
          }
      }

      var http = createHttpClient();
      var stream = new ActiveXObject("ADODB.Stream");
      var fso = new ActiveXObject("Scripting.FileSystemObject");
      var outFile;    // TextStream of the category currently being written
      var outName;    // its file name, used in error messages

      // Work in the script's own folder so the output files land next to it.
      new ActiveXObject("WScript.Shell").CurrentDirectory = fso.GetParentFolderName(WScript.ScriptFullName);
      stream.Mode = 3;

      for (var idx = 0; idx < arguments.length; ++idx) {
          var category = arguments[idx];
          if (!Object.prototype.hasOwnProperty.call(CATEGORY_PATHS, category)) {
              reportError("Cannot find " + category);
              continue;
          }
          outName = category + ".txt";
          try {
              outFile = fso.OpenTextFile(outName, 2, true);
          } catch (openErr) {
              reportError(openErr, "Opening file " + outName);
              continue;
          }
          try {
              scrapeCategory(resolveUrl(CATEGORY_PATHS[category]));
          } catch (categoryErr) {
              reportError(categoryErr);
          } finally {
              outFile.Close();
          }
      }
      WScript.Echo("Mission complete.");
  })("唯美的句子", "伤感的句子", "幸福的句子", "爱情的句子", "表白的句子", "励志的句子", "正能量的句子", "经典句子", "好词好句",
     "励志名言", "名人名言", "名言警句", "读书名言", "爱国名言", "英语名言", "经典名言",
     "经典语录", "名人语录", "励志语录", "正能量语录", "情感语录", "心情语录", "爱情语录", "伤感语录", "搞笑语录",
     "人生格言", "励志格言", "爱情格言", "经典格言", "英语格言",
     "节日诗句", "写景诗句", "抒情诗句", "爱国诗句",
     "说说心情短语", "伤感说说", "空间说说", "经典说说", "搞笑说说", "个性说说",
     "个性签名", "QQ个性签名", "伤感个性签名", "搞笑个性签名", "励志个性签名", "女生个性签名", "幸福个性签名", "情侣个性签名", "超拽个性签名",
     "春节短信", "情人节短信", "元宵节短信", "生日短信", "结婚短信", "妇女节短信", "愚人节短信", "劳动节短信", "母亲节短信", "父亲节短信",
     "端午节短信", "七夕节短信", "中秋节短信", "感恩节短信", "圣诞节短信", "儿童节短信", "经典短信", "国庆节短信", "教师节短信");
复制代码
1

评分人数

TOP

本帖最后由 flashercs 于 2018-9-30 00:44 编辑

http://www.ceasm.com/
  /**
   * Windows Script Host (JScript) scraper for http://www.ceasm.com/.
   *
   * Each argument passed to the wrapper function is the link text of a
   * top-level category in the site's <div class="menu">. For every category
   * the script opens "<category>.txt" next to the script file (overwriting
   * it), reads the sub-category links from the category page's "keywords"
   * block, walks every listing page of each sub-category (following the
   * "下一页" link) and writes one record per article: "A" + <h2> title on the
   * first line, then the text of all pages of the article.
   *
   * Written for classic JScript: only ES3 syntax, no trailing commas.
   */
  (function () {
      var SITE_ROOT = "http://www.ceasm.com/";

      var MENU_RE = /<div[^>]+class="menu"[^>]*>[\S\s]*?<\/div>/i;
      // The closing quote class was left unterminated in the original
      // (['"'[^>]*), which only matched when ">" directly followed the quote.
      var KEYWORDS_RE = /<div[^>]+class=['"]keywords['"][^>]*>[\S\s]+?<\/div>/i;
      var SUBCATEGORY_LINK_RE = /<a[^>]+href=["']([^"']+)["'][^>]*>([^<>]*)<\/a>/gi;
      var LIST_RE = /<ul[^>]+class=["']dedelist["'][^>]*>[\S\s]+?<\/ul>/i;
      var PAGER_RE = /<div[^>]+class=["']dede_pages["'][^>]*>[\S\s]+?<\/div>/i;
      var ARTICLE_LINK_RE = /<h4>\s*<a[^>]+href=["']([^"']+)["'][^>]*>[^<>]*<\/a>\s*<\/h4>/gi;
      var NEXT_LINK_RE = /<a[^>]+href=["']([^"']+)["'][^>]*>下一页<\/a>/i;
      var TITLE_RE = /<h2>([\S\s]+?)<\/h2>/i;
      var BODY_RE = /<div[^>]+class=["']text["'][^>]*>\s*<table[^>]*>[\S\s]+?<\/table>/i;
      var MARKUP_RE = /<[^>]*>|(?:\s|\u3000|&nbsp;)+/gi;
      var BLANK_LINE_RE = /\n\s*(?=\n)/g;

      // HTML entities that are turned back into the characters they stand for.
      var ENTITY_TABLE = [
          [/&middot;/g, "·"],
          [/&mdash;/g, "—"],
          [/&ldquo;/g, "“"],
          [/&rdquo;/g, "”"],
          [/&hellip;/g, "……"],
          [/&lsquo;/g, "‘"],
          [/&rsquo;/g, "’"]
      ];

      /**
       * Show an error to the user through WScript.Echo.
       * Error objects are expanded into name, source, and the number split into
       * its hexadecimal form, bits 16-28 ("equipment") and low 16 bits ("code");
       * any other value is echoed as-is.
       *
       * @param {*} err Error object or plain message.
       * @param {string} [source] Extra context shown on the "source:" line.
       */
      function reportError(err, source) {
          if (Object.prototype.toString.call(err) !== "[object Error]") {
              WScript.Echo(err);
              return;
          }
          WScript.Echo([
              err.name,
              "source: " + (typeof source === "undefined" ? "" : source),
              "number: " + (err.number >>> 0).toString(16),
              "equipment: " + (err.number >> 16 & 8191),
              "code: " + (err.number & 65535),
              "Information: " + err.message
          ].join("\n"));
      }

      /**
       * Create the first XMLHTTP automation object that is available, trying
       * the newest ProgID first. Quits the script with exit code 1 if none works.
       *
       * NOTE(review): catch identifiers in this script deliberately never reuse
       * the name of a function or variable of the enclosing scope; classic
       * JScript is reported to bind the catch identifier in that scope, which
       * would overwrite the outer binding - confirm against the JScript docs.
       */
      function createHttpClient() {
          var progIds = ["MSXML2.XMLHTTP.6.0", "MSXML2.XMLHTTP.3.0", "MSXML2.XMLHTTP", "Microsoft.XMLHTTP"];
          for (var k = 0; k < progIds.length; k++) {
              try {
                  return new ActiveXObject(progIds[k]);
              } catch (creationErr) {
                  // This ProgID could not be created; try the next one.
              }
          }
          reportError("Can't build XMLHTTP automation object.");
          WScript.Quit(1);
      }

      /**
       * Turn a link taken from a page into a full URL.
       * Absolute http(s) links are returned unchanged; anything else is appended
       * to SITE_ROOT without producing a doubled slash.
       */
      function resolveUrl(href) {
          if (/^https?:\/\//i.test(href)) {
              return href;
          }
          return SITE_ROOT + href.replace(/^\/+/, "");
      }

      /**
       * Download a URL synchronously and decode the response body as "gbk".
       *
       * @param {string} url Address to fetch.
       * @returns {string} Decoded page text, or "" (after reporting) when the
       *     HTTP status is not 200.
       */
      function fetchText(url) {
          http.open("GET", url, false);
          http.send();
          if (http.status !== 200) {
              reportError('fetch URI "' + url + '" failed.\nstatus: ' + http.status);
              return "";
          }
          stream.Type = 1;            // binary, to accept the raw response bytes
          stream.Open();
          try {
              stream.Write(http.responseBody);
              stream.Position = 0;
              stream.Type = 2;        // text, so ReadText decodes with Charset
              stream.Charset = "gbk";
              return stream.ReadText();
          } finally {
              // The stream is shared by every download; always leave it closed
              // so the next call can reopen it even after a failure here.
              stream.Close();
          }
      }

      /** Replacement callback for MARKUP_RE: "</div>" and "</p>" end a line, everything else is dropped. */
      function markupReplacement(token) {
          var lowered = token.toLowerCase();
          return lowered === "</div>" || lowered === "</p>" ? "\r\n" : "";
      }

      /** Apply every ENTITY_TABLE substitution to the given text. */
      function decodeEntities(text) {
          for (var k = 0; k < ENTITY_TABLE.length; k++) {
              text = text.replace(ENTITY_TABLE[k][0], ENTITY_TABLE[k][1]);
          }
          return text;
      }

      /**
       * Download one article (all of its pages) and append it to the current
       * output file.
       *
       * The article is skipped without output when the first page has no <h2>
       * title or when any page lacks the text block. A download that throws is
       * reported; on a follow-up page it also ends the article at the text
       * collected so far.
       */
      function scrapeArticle(url) {
          var baseDir = url.replace(/[^\/]+$/, "");
          var body = "";
          var html;
          try {
              html = fetchText(url);
          } catch (fetchErr) {
              reportError(fetchErr, "url=" + url);
              return;
          }
          var titleMatch = html.match(TITLE_RE);
          if (!titleMatch) {
              return;
          }
          for (;;) {
              var bodyMatch = html.match(BODY_RE);
              if (!bodyMatch) {
                  return;
              }
              body += bodyMatch[0].replace(MARKUP_RE, markupReplacement);

              var pagerMatch = html.match(PAGER_RE);
              var nextMatch = pagerMatch ? pagerMatch[0].match(NEXT_LINK_RE) : null;
              if (!nextMatch) {
                  break;
              }
              var nextUrl = baseDir + nextMatch[1];
              try {
                  html = fetchText(nextUrl);
              } catch (nextErr) {
                  reportError(nextErr, "url=" + nextUrl);
                  break;
              }
          }
          try {
              outFile.WriteLine(
                  "A" + titleMatch[1] + "\r\n" +
                  decodeEntities(body).replace(/文章转自.*$/, "").replace(BLANK_LINE_RE, "")
              );
          } catch (writeErr) {
              reportError(writeErr, "Writing to file " + outName + " failed.");
          }
      }

      /**
       * Scrape every article of one sub-category, page by page.
       * Next-page links are resolved against the directory of the first
       * listing URL.
       */
      function scrapeListing(url) {
          var baseDir = url.replace(/[^\/]+$/, "");
          var pageUrl = url;
          for (;;) {
              var html = fetchText(pageUrl);
              var listMatch = html.match(LIST_RE);
              var pagerMatch = html.match(PAGER_RE);
              var listHtml = listMatch ? listMatch[0] : "";
              var link;
              // ARTICLE_LINK_RE is global and shared; start every page from offset 0.
              ARTICLE_LINK_RE.lastIndex = 0;
              while ((link = ARTICLE_LINK_RE.exec(listHtml)) !== null) {
                  scrapeArticle(resolveUrl(link[1]));
              }
              var nextMatch = pagerMatch ? pagerMatch[0].match(NEXT_LINK_RE) : null;
              if (!nextMatch) {
                  break;
              }
              pageUrl = baseDir + nextMatch[1];
          }
      }

      /** Scrape every sub-category linked from the "keywords" block of a category page. */
      function scrapeCategory(url) {
          var keywordsMatch = KEYWORDS_RE.exec(fetchText(url));
          var keywordsHtml = keywordsMatch ? keywordsMatch[0] : "";
          var sub;
          SUBCATEGORY_LINK_RE.lastIndex = 0;
          while ((sub = SUBCATEGORY_LINK_RE.exec(keywordsHtml)) !== null) {
              try {
                  scrapeListing(resolveUrl(sub[1]));
              } catch (listingErr) {
                  // One broken sub-category must not stop the remaining ones.
                  reportError(listingErr);
              }
          }
      }

      var http = createHttpClient();
      var stream = new ActiveXObject("ADODB.Stream");
      var fso = new ActiveXObject("Scripting.FileSystemObject");
      var outFile;    // TextStream of the category currently being written
      var outName;    // its file name, used in error messages

      // Work in the script's own folder so the output files land next to it.
      new ActiveXObject("WScript.Shell").CurrentDirectory = fso.GetParentFolderName(WScript.ScriptFullName);
      stream.Mode = 3;

      var menuMatch = fetchText(SITE_ROOT).match(MENU_RE);
      if (!menuMatch) {
          reportError("No navMenu.");
          WScript.Quit(1);
      }
      var menuHtml = menuMatch[0];

      for (var idx = 0; idx < arguments.length; ++idx) {
          var category = arguments[idx];
          var categoryLink = menuHtml.match(RegExp('<a[^>]+href="([^"]+)"[^>]*>' + category, "i"));
          if (!categoryLink) {
              reportError("Cannot find " + category);
              continue;
          }
          outName = category + ".txt";
          try {
              outFile = fso.OpenTextFile(outName, 2, true);
          } catch (openErr) {
              reportError(openErr, "Opening file " + outName);
              continue;
          }
          try {
              scrapeCategory(resolveUrl(categoryLink[1]));
          } catch (categoryErr) {
              reportError(categoryErr);
          } finally {
              outFile.Close();
          }
      }
      WScript.Echo("Mission complete.");
  })("励志名言", "名人名言", "励志文章", "人生感悟", "经典语录", "经典句子", "散文精选", "情感日志");
复制代码
1

评分人数

TOP

本帖最后由 WHY 于 2018-10-3 12:17 编辑

修改 for 循环(即 for (var i=1; i<=1000; i++) 这一行)中变量 i 的取值范围。这里 i 设置为从 1 到 1000,最大可以取值从 1 到 145000 多
比如: var i=1001; i<=3000; i++
可以下载 1001.html 到 3000.html 页面句子。
  // Downloads the numbered pages http://www.1juzi.com/new/<i>.html and appends
  // each page's title (prefixed with "A") and its <p> paragraphs to the log
  // file via writeToFile(). `fso` and `http` are shared with the helper
  // functions of this script.
  var fso = new ActiveXObject('Scripting.FileSystemObject');
  var http = new ActiveXObject('Microsoft.XMLHTTP');
  var home = 'http://www.1juzi.com/';
  var arr;    // current <p> match; declared so it is no longer an implicit global
  // Edit the bounds of this loop to choose which page numbers are downloaded.
  for (var i=1; i<=1000; i++) {
      var url = home + 'new/' + i + '.html';
      try {
          var txt = getText(url);
          // Title of the page; pages whose <h1> starts with "服务器错误" are skipped.
          var m = txt.match(/<h1>((?!服务器错误).*?)<\/h1>/);
          if (!m) continue;
          var stc = [];
          var reg = /<p>(?!<\/p).*<\/p>/ig;
          while ((arr = reg.exec(txt)) !== null) {
              var s = arr[0].replace(/<[^>]*>|&nbsp;/g, '');
              // Drop the site's own watermark text.
              s = s.replace(/句子大全http:\/\/www\.1juzi\.com\//ig, '');
              // Turn HTML entities back into the characters they stand for.
              s = s.replace(/&lsquo;/g, '‘').replace(/&rsquo;/g, '’').replace(/&hellip;/g,'…').replace(/&#39;/g, "'");
              s = s.replace(/&middot;/g, '·').replace(/&ldquo;/g, '“').replace(/&rdquo;/g, '”');
              s = s.replace(/&mdash;/g, '—').replace(/&quot;/g, '"');
              stc.push(s);
          }
          writeToFile('A' + m[1] + '\r\n' + stc.join('\r\n'));
      } catch (pageErr) {
          // A page that cannot be downloaded or written must not abort the
          // remaining pages; report it and move on.
          WSH.Echo('Skipped ' + url + ': ' + pageErr.message);
      }
  }
  /**
   * Download a page synchronously with the shared `http` object and decode the
   * response body as GB2312 text.
   *
   * The raw bytes are written into a fresh ADODB.Stream in binary mode and read
   * back in text mode with the GB2312 charset. The stream is always closed,
   * including when writing or decoding throws.
   *
   * @param {string} url Address of the page to download.
   * @returns {string} Decoded page text.
   */
  function getText(url) {
      http.open('GET', url, false);
      http.send();
      var stream = new ActiveXObject('ADODB.Stream');
      var text;
      stream.Mode = 3;
      stream.Type = 1;            // binary, to accept the raw response bytes
      stream.Open();
      try {
          stream.Write(http.responseBody);
          stream.Position = 0;
          stream.Type = 2;        // text, so ReadText decodes with Charset
          stream.Charset = 'GB2312';
          text = stream.ReadText(-1);
      } finally {
          stream.Close();
      }
      return text;
  }
  /**
   * Append one line to the log file '1juzi.Log' (created when missing).
   *
   * The file is opened through the shared `fso` object in append mode (8) for
   * every call and is always closed again, including when the write throws.
   * No format argument is passed, so the FileSystemObject default text format
   * applies.
   *
   * @param {string} str Text to append; a line break is added by WriteLine.
   */
  function writeToFile(str) {
      var objFile = fso.OpenTextFile('1juzi.Log', 8, true);
      try {
          objFile.WriteLine(str);
      } finally {
          objFile.Close();
      }
  }
  // Tell the user that the whole page range has been processed.
  WSH.Echo('Done');
复制代码
1

评分人数

TOP

本帖最后由 WHY 于 2018-9-30 19:19 编辑
  // Scrapes the categories listed in `map` from http://www.ceasm.com/: every
  // listing page of a category is walked through its "下一页" link and every
  // article linked from <ul class="dedelist"> is appended to "<category>.Log"
  // via writeToFile(). `fso`, `http` and `key` are shared with the helper
  // functions of this script.
  var fso = new ActiveXObject('Scripting.FileSystemObject');
  var http = new ActiveXObject('Microsoft.XMLHTTP');
  // All known categories; copy entries into `map` below to scrape more of them.
  //var map = {'励志名言':'lizhimingyan','名人名言':'mingrenmingyan','励志文章':'lizhiwenzhang','人生感悟':'renshengganwu','经典语录':'jingdianyulu','经典句子':'jingdianjuzi','散文精选':'sanwenjingxuan','情感日志':'qingganrizhi'};
  var map = {'散文精选':'sanwenjingxuan'};
  var home = 'http://www.ceasm.com/';
  var arr;    // current article-link match; declared so it is no longer an implicit global
  for (var key in map) {
      var url = home + map[key] + '/';
      var reg = /<h4><a href="\/([^"]*\.html)" target="_blank">([^<>]*)<\/a><\/h4>/g;
      while (url != '') {
          var txt;
          try {
              txt = getText(url);
          } catch (listErr) {
              // Without this listing page the next-page link is unknown, so the
              // category ends here.
              WSH.Echo('Skipped ' + url + ': ' + listErr.message);
              break;
          }
          var m = txt.match(/<a href='([^']*\.html)'>下一页/);
          if (m) {
              // Root-relative links hang off `home`; plain file names are
              // resolved against the directory of the current listing page.
              url = m[1].charAt(0) === '/'
                  ? home + m[1].substring(1)
                  : url.replace(/[^\/]*$/, '') + m[1];
          } else {
              url = '';
          }
          txt = txt.split('<ul class="dedelist">')[1];
          if (!txt) continue;
          txt = txt.split('</ul>')[0];
          reg.lastIndex = 0;
          while ((arr = reg.exec(txt)) !== null) {
              try {
                  writeToFile('A' + arr[2] + '\r\n' + getSentence(home + arr[1]));
              } catch (articleErr) {
                  // One article that cannot be downloaded or written must not
                  // abort the rest of the run.
                  WSH.Echo('Skipped ' + home + arr[1] + ': ' + articleErr.message);
              }
          }
      }
  }
  /**
   * Download a page synchronously with the shared `http` object and decode the
   * response body as GB2312 text.
   *
   * The raw bytes are written into a fresh ADODB.Stream in binary mode and read
   * back in text mode with the GB2312 charset. The stream is always closed,
   * including when writing or decoding throws.
   *
   * @param {string} url Address of the page to download.
   * @returns {string} Decoded page text.
   */
  function getText(url) {
      http.open('GET', url, false);
      http.send();
      var stream = new ActiveXObject('ADODB.Stream');
      var text;
      stream.Mode = 3;
      stream.Type = 1;            // binary, to accept the raw response bytes
      stream.Open();
      try {
          stream.Write(http.responseBody);
          stream.Position = 0;
          stream.Type = 2;        // text, so ReadText decodes with Charset
          stream.Charset = 'GB2312';
          text = stream.ReadText(-1);
      } finally {
          stream.Close();
      }
      return text;
  }
  /**
   * Download an article, following its "下一页" (next page) links, and return
   * its text with one paragraph per line.
   *
   * For every page the part between '<div class="text">' and the first
   * '</table>' is kept; "</p>" becomes a line break, all other tags and
   * "&nbsp;" are removed, leading whitespace and empty lines are dropped and a
   * fixed set of HTML entities is decoded. Pages without the text block
   * contribute nothing.
   *
   * @param {string} url Address of the first page of the article.
   * @returns {string} Text of all pages joined with "\r\n".
   */
  function getSentence(url) {
      var stc = [];
      while (url != '') {
          var s = getText(url);
          // Work out the next page before `s` is cut down to the text block.
          var m = s.match(/<a href='([^']*\.html)'>下一页/);
          url = m ? url.replace(/[^\/]*$/, '') + m[1] : '';
          s = s.split('<div class="text">')[1];
          if (!s) continue;
          s = s.split('</table>')[0];
          s = s.replace(/(?:<\/strong>)?<\/p>/g, '\r\n');
          s = s.replace(/<[^>]*>|&nbsp;/g, '');
          // NOTE(review): the original class held three characters that all
          // look like a plain space; assumed to be space, tab and the
          // full-width space U+3000 - confirm against the author's file.
          s = s.replace(/^[ \t\u3000]+/gm, '');
          // Collapse runs of line breaks, then trim one at either end.
          s = s.replace(/(\r?\n)+/g, '$1').replace(/^(?:\r?\n)|(?:\r?\n)$/g, '');
          s = s.replace(/&#39;/g, "'");
          s = s.replace(/&lsquo;/g, '‘').replace(/&rsquo;/g, '’').replace(/&hellip;/g,'…');
          s = s.replace(/&middot;/g, '·').replace(/&ldquo;/g, '“').replace(/&rdquo;/g, '”');
          s = s.replace(/&mdash;/g, '—').replace(/&quot;/g, '"');
          stc.push(s);
      }
      return stc.join('\r\n');
  }
  /**
   * Append one line to the log file of the category currently being scraped.
   *
   * The file name is the shared loop variable `key` plus '.Log'. The file is
   * opened through the shared `fso` object in append mode (8), created when
   * missing, and is always closed again, including when the write throws.
   * No format argument is passed, so the FileSystemObject default text format
   * applies.
   *
   * @param {string} str Text to append; a line break is added by WriteLine.
   */
  function writeToFile(str) {
      var objFile = fso.OpenTextFile(key + '.Log', 8, true);
      try {
          objFile.WriteLine(str);
      } finally {
          objFile.Close();
      }
  }
  // Tell the user that every category in `map` has been processed.
  WSH.Echo('Done');
复制代码
1

评分人数

TOP

回复 6# WHY
麻烦帮我看一看。经典语录这里退出了。谢谢!
祝大家国庆节快乐!!

TOP

回复 7# lxh623

已修改。
有些网址打不开,继续即可。

TOP

回复 8# WHY
谢谢!使用中有个想法,以后,是不是尽可能用网页字符代码。

TOP

回复 5# WHY
求保持网页代码的办法。因为有乱码。

TOP

在扒第一个网站的时候发现了一段不太一样的
http://www.ceasm.com/lizhiyulu/2814.html
  1. 克日,萌宠举措悬疑笑剧影戏《营救汪星人》宣布了全新的“励志汪”版创意主题写真,写真图中汪汪们或在山顶瞻仰天空,或趴在路边思索人生,再配以切合意境的“励志心灵鸡汤”每只狗狗都有和本身相对应的励志语录。如:“不是每次主人城市给肉骨头,但每一次都值得本汪去实行”,站长之家 站长站 中国站长 站长网 seo查询 帮站seo 权重查询 网站权重查询 站长工具综合查询 百度排名查询 排名查询 iis7网站监控, 网站监控 免费网站监控 监控宝 域名劫持 域名被墙 dns污染 网站速度检测 网站速度测试 网站被黑 ,这种话也只有二哈能想出来;“若是本日糊口诱骗了你,不要哀痛,不要抽泣,你还可以诱骗你本身”等风趣应景的话语,让人捧腹大笑的同时不禁直呼列位看官快饮了这碗“狗狗心灵鸡汤”。
复制代码
综合型编程论坛
Writing Code That Nobody Else Can Read.

TOP

回复 10# lxh623


    我不明白“保持网页代码”是个什么鬼,
我这里测试没有“乱码”,请举例说明你的要求。

另外,5楼脚本中把 &lsquo;、&middot;、&mdash; 等 html 转义字符替换成对应符号的,是循环里那几条 s.replace(/&…;/g, …) 语句。如果“乱码”指这些转义字符,删掉这几条替换语句即可(注意不要删掉 stc.push(s) 那一行)。

TOP

回复 12# WHY
http://www.1juzi.com/new/6228.html
还有签名大全里面有很多自创字符,EmEditor打开编辑后有乱码。

另一个兄弟是保存为同一编码的文本。

所以,我觉得,编码可能一样更好。UTF-8和GB,有些字符会变化。
谢谢!国庆节快乐!

TOP

回复 13# lxh623


    这个网址我下载后用记事本打开没发现有乱码,Win10 v1803 系统。

TOP

有些标题没有抓到,请再帮我看看。
比如,http://www.1juzi.com/new/122019.html
我试着抓了一千页,没有得到该页标题。
谢谢!

TOP

返回列表