52shici 还有两个内容:works 和 posts。开始都搞错了……也不知要哪个?都弄上吧。
52shiciWorks.bat
- 0<1/*,:
- @echo off
- CScript.exe -nologo -e:jscript %0
- pause
- exit /b
- */;
- (function () {
// Script-wide state. Everything lives in one `var` statement so that the
// function declarations further down in this closure can reach it.
var xhr = (function () {
    // Try the XMLHTTP ProgIDs in the listed order and keep the first one
    // that can be instantiated on this machine.
    var aXMLHttpVers = ['MSXML2.XMLHTTP.6.0', 'MSXML2.XMLHTTP.3.0', 'MSXML2.XMLHTTP', 'Microsoft.XMLHTTP'];
    for (var i = 0; i < aXMLHttpVers.length; i++) {
        try {
            return new ActiveXObject(aXMLHttpVers[i]);
        } catch (error) {
            // This ProgID could not be created; fall through to the next one.
        }
    }
    showError('Can\'t build XMLHTTP automation object.');
    WScript.Quit(1);
})(),
    fso = new ActiveXObject('Scripting.FileSystemObject'),
    // Off-screen HTML document; fetched fragments are written into it so they
    // can be walked as DOM nodes.
    htmldoc = new ActiveXObject('htmlfile'),
    domain = "http://www.52shici.com",
    outFile = "52shici_works.txt",
    cacheFile = "cache_52shici_works.txt",
    tsOut,
    tsCache,
    // Keys are lines of the cache file: URLs that were already processed.
    oCache = {},
    url,
    // A line break together with any spaces that follow it.
    reCrLfS = /\r?\n */g,
    i,
    l,
    aIndex = [];

// Resolve the relative file names above against the script's own folder.
new ActiveXObject('WScript.Shell').CurrentDirectory = fso.GetParentFolderName(WScript.ScriptFullName);

// Pass 1: read the cache (iomode 1 = reading, created when missing).
try {
    tsCache = fso.OpenTextFile(cacheFile, 1, true);
} catch (e) {
    // Report the exception itself, like the two handlers below do, so the
    // COM error number and message are not lost.
    showError(e, 'can not read cache file ' + cacheFile);
    WScript.Quit(1);
}
while (!tsCache.AtEndOfStream) {
    oCache[tsCache.ReadLine()] = true;
}
tsCache.Close();

// Pass 2: reopen the cache for appending (iomode 8) so progress is recorded
// as the run goes on.
try {
    tsCache = fso.OpenTextFile(cacheFile, 8, true);
} catch (e) {
    showError(e, 'can not write cache file ' + cacheFile);
    WScript.Quit(2);
}

// The output file is appended to as well; format -1 selects Unicode.
try {
    tsOut = fso.OpenTextFile(outFile, 8, true, -1);//unicode encoding
} catch (e) {
    showError(e, 'can not write file ' + outFile);
    WScript.Quit(3);
}
// main
// Walk the listing of every category (type 0..19). Page 1 of a category is
// fetched first, with the pager included, only to learn how many pages exist.
for (var type = 0; type <= 19; ++type) {
    var url = 'http://www.52shici.com/original.php?type=' + type + '&page=';
    if (!getHTMLindex(url + 1, true)) continue;
    var body = htmldoc.body;
    // getHTMLindex appends the pager <span> after the list, so it is the
    // last child of the body.
    var pager = body.lastChild;
    var aPages = pager ? String(pager.innerText).match(/当前\d+\/(\d+)页/) : null;
    if (!aPages) {
        // Without a page count the category cannot be walked; skip it
        // instead of crashing on a null match.
        showError('can not read page count for ' + url + 1);
        continue;
    }
    var totalPgs = parseInt(aPages[1], 10);
    WSH.Echo('totalpages=' + totalPgs);
    for (var m = 1; m <= totalPgs; ++m) {
        if (oCache[url + m] || !getHTMLindex(url + m)) continue;
        body = htmldoc.body;
        var aList = body.getElementsByTagName('a');
        // Collect the links first: getContent() rewrites htmldoc, which
        // would invalidate aList while it is being walked.
        aIndex.length = 0;
        for (i = 0, l = aList.length; i < l; ++i) {
            var href = aList[i].getAttribute('href');
            // Anchors without a target are not links to a work.
            if (href) aIndex.push(parseURL(href, url));
        }
        for (i = 0, l = aIndex.length; i < l; ++i) {
            getContent(aIndex[i]);
        }
        // Mark the index page as done so a later run skips it.
        tsCache.WriteLine(url + m);
    }

}
tsOut.close();
tsCache.close();
WScript.Quit();
/**
 * Fetch one work page and append its title, author and text to the output
 * file, then record the URL in the cache.
 *
 * Nothing is written when the URL is already cached, when the fetch fails,
 * or when the page does not contain all three expected parts.
 *
 * @param {string} url Absolute URL of the work page.
 */
function getContent(url) {
    if (oCache[url] || !getHTML(url)) return;
    try {
        // getHTML() wrote the page's main <div> as the first child of <body>.
        // The lookup is inside the try so an unexpected page is reported
        // instead of aborting the whole run.
        var main = htmldoc.body.children[0];
        var nodes = main.children;
        var title, author, content;
        for (var i = 0, l = nodes.length; i < l; ++i) {
            var item = nodes[i];
            if ('works-h1' === item.className) {
                title = 'TTT ' + item.children[0].innerText;
                // WSH.Echo( 'title: '+title);
                author = item.children[1].innerText.replace('文/', '作者:');
                // WSH.Echo( 'author: '+author);
            } else if ('works-content' === item.className) {
                content = item.innerText;
                // The text is the last part needed; stop scanning.
                break;
            }
        }
        if (undefined === title || undefined === author || undefined === content) {
            // Do not write "undefined" into the output, and do not cache the
            // URL, so the page can be retried on a later run.
            showError('unexpected page layout, skipped: ' + url);
            return;
        }
        tsOut.WriteLine((title + '\r\n' + author + '\r\n' + content).replace(reCrLfS, '<br/>\r\n'));
        tsCache.WriteLine(url);
        // Remember the URL for the rest of this run too; the same work can be
        // linked from several index pages.
        oCache[url] = true;
    } catch (e) {
        showError(e, url);
    }

}
-
/**
 * Write an error report to standard output.
 *
 * For an Error object the report has four lines: its name, the given source,
 * its error number in hexadecimal, and its message. Any other value is
 * written as it is, followed by a "source:" line when a source was given.
 *
 * @param {*} err The Error object, or a plain message.
 * @param {string} [source] Optional context, e.g. the action that failed.
 */
function showError(err, source) {
    var isError = '[object Error]' === Object.prototype.toString.call(err);
    var text;
    if (isError) {
        text = [
            err.name,
            'source: ' + (undefined === source ? '' : source),
            // >>> 0 turns the signed error number into its unsigned form
            // before it is printed in hexadecimal.
            'number: ' + (err.number >>> 0).toString(16),
            'Information: ' + err.message
        ].join('\r\n');
    } else if (undefined === source) {
        text = err;
    } else {
        // Keep the context for plain messages too instead of dropping it.
        text = [String(err), 'source: ' + source].join('\r\n');
    }
    WScript.StdOut.WriteLine(text);
}
/**
 * Download a page with a synchronous GET and browser-like request headers.
 * Progress is written to standard output; on failure the "failed" message is
 * written here as well.
 *
 * @param {string} url Page to download.
 * @returns {?string} The response text, or null when the request could not
 *     be made or the status was not 200.
 */
function requestPage(url) {
    WScript.StdOut.Write('fetching ' + url + '...');
    try {
        // open() is inside the try so a malformed URL counts as a failed
        // fetch instead of ending the script.
        xhr.open('GET', url, false);
        xhr.setRequestHeader('Accept', 'text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8');
        xhr.setRequestHeader('Accept-Language', 'en-US, en; q=0.8, zh-Hans-CN; q=0.5, zh-Hans; q=0.3');
        // NOTE(review): the trailing slash is not part of a host name; kept
        // unchanged because it is unclear whether MSXML sends this value.
        xhr.setRequestHeader('host', 'www.52shici.com/');
        xhr.setRequestHeader('Connection', 'close');
        xhr.setRequestHeader('Cache-Control', 'no-cache');
        xhr.setRequestHeader('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134');
        xhr.send();
        if (200 !== xhr.status) {
            WScript.StdOut.WriteLine('failed. status:' + xhr.status);
            return null;
        }
        return xhr.responseText;
    } catch (e) {
        WScript.StdOut.WriteLine('failed');
        return null;
    }
}

/**
 * Replace the content of the shared htmldoc with the given body markup and
 * report the outcome on standard output.
 *
 * @param {?string} bodyHtml Markup for the <body>, or null when the wanted
 *     part was not found in the page.
 * @returns {boolean} true when the document was rewritten.
 */
function loadBody(bodyHtml) {
    if (null === bodyHtml) {
        WScript.StdOut.WriteLine('failed');
        return false;
    }
    try {
        htmldoc.open();
        try {
            htmldoc.write('<!DOCTYPE html><html><head></head><body>' + bodyHtml + '</body></html>');
        } finally {
            // Only a document that was opened is closed.
            htmldoc.close();
        }
    } catch (e) {
        WScript.StdOut.WriteLine('failed');
        return false;
    }
    WScript.StdOut.WriteLine('success');
    return true;
}

/**
 * Fetch an index page and load its list of works (the <ul id="listWorks">)
 * into htmldoc.
 *
 * @param {string} url Index page to fetch.
 * @param {boolean} [boltotalPages] When truthy, the pager <span class="mt">
 *     is appended after the list, and the call fails if it is missing.
 * @returns {boolean} true when htmldoc now holds the requested markup.
 */
function getHTMLindex(url, boltotalPages) {
    var html = requestPage(url);
    if (null === html) return false;
    var aList = html.match(/<ul[^>]*id="listWorks"[^>]*>[\s\S]*?<\/ul>/i);
    var aPager = boltotalPages ? html.match(/<span[^>]*class=['"]mt['"][^>]*>[^<]*<\/span>/i) : null;
    // WSH.Echo(xhr.responseText.match(/<span[^>]*class=['"]mt['"][^>]*>[^<]*<\/span>/i)[0]);
    // WSH.Quit();
    if (!aList || (boltotalPages && !aPager)) return loadBody(null);
    return loadBody(aList[0] + (aPager ? aPager[0] : ''));
}

/**
 * Fetch a work page and load its main <div> (everything from
 * <div class="main" up to the "main end" comment) into htmldoc.
 *
 * @param {string} url Work page to fetch.
 * @returns {boolean} true when htmldoc now holds the main <div>.
 */
function getHTML(url) {
    var html = requestPage(url);
    if (null === html) return false;
    var aTxt = html.match(/<div class="main"[\s\S]+(?=<!--main end -->)/i);
    return loadBody(aTxt ? aTxt[0] : null);
}
/**
 * Turn the href of a link into an absolute URL.
 *
 * A leading scheme (for example the "about:" that the htmlfile document may
 * put in front of relative links) is ignored unless it is followed by "//".
 *
 *   scheme://host/path  -> returned unchanged
 *   //host/path         -> prefixed with the scheme of `url`
 *   /path               -> appended to `domain`
 *   anything else       -> resolved against the directory of `url`
 *
 * @param {?string} href Value of the link's href; null or undefined is
 *     treated as an empty string.
 * @param {string} url URL of the page the link was found on.
 * @returns {string} The absolute URL.
 */
function parseURL(href, url) {
    var sHref = (null === href || undefined === href) ? '' : String(href);
    // Only a real scheme prefix counts; a colon further inside a relative
    // link (e.g. in its query string) must not cut the link short.
    var reScheme = /^[a-zA-Z][a-zA-Z0-9+.\-]*:/;
    var $path = sHref.replace(reScheme, '');
    if (/^\/\//.test($path)) {
        if ($path !== sHref) {
            // A scheme was present, so the link is already absolute.
            return sHref;
        }
        // Protocol-relative link: borrow the scheme of the containing page.
        var aScheme = reScheme.exec(url);
        return (aScheme ? aScheme[0] : 'http:') + sHref;
    }
    if (/^\//.test($path)) {
        return domain + $path;
    }
    // Directory of the containing page. Query and fragment are removed first
    // because they may contain "/" themselves.
    var base = url.replace(/[?#][\s\S]*$/, '');
    if (/^[^\/]*\/\/[^\/]*$/.test(base)) {
        // Only scheme and host, no path: the directory is the site root.
        base += '/';
    } else {
        base = base.replace(/[^\/]*$/, '');
    }
    return base + $path;
}
- })();
复制代码
52shiciPosts.bat
- 0<1/*,:
- @echo off
- CScript.exe -nologo -e:jscript %0
- pause
- exit /b
- */;
- (function () {
// Script-wide state. Everything lives in one `var` statement so that the
// function declarations further down in this closure can reach it.
var xhr = (function () {
    // Try the XMLHTTP ProgIDs in the listed order and keep the first one
    // that can be instantiated on this machine.
    var aXMLHttpVers = ['MSXML2.XMLHTTP.6.0', 'MSXML2.XMLHTTP.3.0', 'MSXML2.XMLHTTP', 'Microsoft.XMLHTTP'];
    for (var i = 0; i < aXMLHttpVers.length; i++) {
        try {
            return new ActiveXObject(aXMLHttpVers[i]);
        } catch (error) {
            // This ProgID could not be created; fall through to the next one.
        }
    }
    showError('Can\'t build XMLHTTP automation object.');
    WScript.Quit(1);
})(),
    fso = new ActiveXObject('Scripting.FileSystemObject'),
    // Off-screen HTML document; fetched fragments are written into it so they
    // can be walked as DOM nodes.
    htmldoc = new ActiveXObject('htmlfile'),
    domain = "http://www.52shici.com",
    outFile = "52shici_posts.txt",
    cacheFile = "cache_52shici_posts.txt",
    tsOut,
    tsCache,
    // Keys are lines of the cache file: post ids that were already saved.
    oCache = {},
    url,
    // A line break together with any spaces that follow it.
    reCrLfS = /\r?\n */g,
    i,
    // Exclusive upper bound of the post ids that are tried.
    l = 300000,
    strOut,
    sTitle,
    sAuthor,
    sContent;

// Resolve the relative file names above against the script's own folder.
new ActiveXObject('WScript.Shell').CurrentDirectory = fso.GetParentFolderName(WScript.ScriptFullName);

// Pass 1: read the cache (iomode 1 = reading, created when missing).
try {
    tsCache = fso.OpenTextFile(cacheFile, 1, true);
} catch (e) {
    // Report the exception itself, like the two handlers below do, so the
    // COM error number and message are not lost.
    showError(e, 'can not read cache file ' + cacheFile);
    WScript.Quit(1);
}
while (!tsCache.AtEndOfStream) {
    oCache[tsCache.ReadLine()] = true;
}
tsCache.Close();

// Pass 2: reopen the cache for appending (iomode 8) so progress is recorded
// as the run goes on.
try {
    tsCache = fso.OpenTextFile(cacheFile, 8, true);
} catch (e) {
    showError(e, 'can not write cache file ' + cacheFile);
    WScript.Quit(2);
}

// The output file is appended to as well; format -1 selects Unicode.
try {
    tsOut = fso.OpenTextFile(outFile, 8, true, -1);//unicode encoding
} catch (e) {
    showError(e, 'can not write file ' + outFile);
    WScript.Quit(3);
}
// main
// Try every post id below the limit; ids found in the cache were saved by an
// earlier run and are skipped.
for (i = 1; i < l; ++i) {
    if (oCache['' + i]) continue;
    url = 'http://www.52shici.com/posts.php?id=' + i;
    if (!getHTML(url)) continue;
    // Start every post with empty fields; otherwise a post that lacks one of
    // them would be written with the value left over from the previous post.
    sTitle = undefined;
    sAuthor = undefined;
    sContent = undefined;
    try {
        // <body> is the second child of <html>; its first child is the
        // fragment that getHTML() wrote. The lookup is inside the try so an
        // unexpected page is reported instead of ending the run.
        var node = htmldoc.documentElement.childNodes[1].childNodes[0];
        for (var m = 0, n = node.childNodes.length; m < n; ++m) {
            var item = node.childNodes[m];
            switch (item.className) {
                case 'posts-h1':
                    sTitle = 'TTT ' + item.innerText;
                    break;
                case 'posts-h2':
                    sAuthor = '作者:' + item.firstChild.innerText;
                    break;
                case 'posts-content':
                    // The last child of the content node is not wanted in
                    // the saved text.
                    if (item.lastChild) item.removeChild(item.lastChild);
                    sContent = item.innerText;
                    break;
                default:
                    break;
            }
        }
        if (undefined === sTitle || undefined === sAuthor || undefined === sContent) {
            // Not cached, so the post can be retried on a later run.
            showError('unexpected page layout, skipped: ' + url);
            continue;
        }
        strOut = sTitle + '\r\n' + sAuthor + '\r\n' + sContent;
        tsOut.WriteLine(strOut.replace(reCrLfS, '<br/>\r\n'));
        tsCache.WriteLine(i);
    } catch (e) {
        // Report the problem and go on with the next id.
        showError(e, url);
    }
}
tsOut.close();
tsCache.close();
WScript.Quit();
-
/**
 * Write an error report to standard output.
 *
 * For an Error object the report has four lines: its name, the given source,
 * its error number in hexadecimal, and its message. Any other value is
 * written as it is, followed by a "source:" line when a source was given.
 *
 * @param {*} err The Error object, or a plain message.
 * @param {string} [source] Optional context, e.g. the action that failed.
 */
function showError(err, source) {
    var isError = '[object Error]' === Object.prototype.toString.call(err);
    var text;
    if (isError) {
        text = [
            err.name,
            'source: ' + (undefined === source ? '' : source),
            // >>> 0 turns the signed error number into its unsigned form
            // before it is printed in hexadecimal.
            'number: ' + (err.number >>> 0).toString(16),
            'Information: ' + err.message
        ].join('\r\n');
    } else if (undefined === source) {
        text = err;
    } else {
        // Keep the context for plain messages too instead of dropping it.
        text = [String(err), 'source: ' + source].join('\r\n');
    }
    WScript.StdOut.WriteLine(text);
}
/**
 * Fetch a post page and load the part that holds the post (everything from
 * <div class="sidebar" up to <div class="posts-do") into htmldoc.
 *
 * Progress and the outcome are written to standard output.
 *
 * @param {string} url Post page to fetch.
 * @returns {boolean} true when htmldoc now holds the post markup; false when
 *     the request failed, the status was not 200, or the markup was not
 *     found.
 */
function getHTML(url) {
    WScript.StdOut.Write('fetching ' + url + '...');
    try {
        // open() is inside the try so a malformed URL counts as a failed
        // fetch instead of ending the script.
        xhr.open('GET', url, false);
        xhr.setRequestHeader('Accept', 'text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8');
        xhr.setRequestHeader('Accept-Language', 'en-US, en; q=0.8, zh-Hans-CN; q=0.5, zh-Hans; q=0.3');
        // NOTE(review): the trailing slash is not part of a host name; kept
        // unchanged because it is unclear whether MSXML sends this value.
        xhr.setRequestHeader('host', 'www.52shici.com/');
        xhr.setRequestHeader('Connection', 'close');
        xhr.setRequestHeader('Cache-Control', 'no-cache');
        xhr.setRequestHeader('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134');
        xhr.send();
        if (200 !== xhr.status) {
            WScript.StdOut.WriteLine('failed. status:' + xhr.status);
            return false;
        }
        var aTxt = xhr.responseText.match(/<div class="sidebar"[\s\S]+(?=<div class="posts-do")/i);
        if (!aTxt) {
            // Page without the expected markup: an ordinary failure, no
            // exception needed.
            WScript.StdOut.WriteLine('failed');
            return false;
        }
        htmldoc.open();
        try {
            // The match stops before the closing tag of the outer <div>, so
            // one </div> is appended to balance the markup.
            htmldoc.write('<!DOCTYPE html><html><head><meta http-equiv="Content-Type" content="text/html; charset=GB2312" /></head><body>' + aTxt[0] + '</div></body></html>');
        } finally {
            // Only a document that was opened is closed.
            htmldoc.close();
        }
    } catch (e) {
        WScript.StdOut.WriteLine('failed');
        return false;
    }
    WScript.StdOut.WriteLine('success');
    return true;
}
- })();
复制代码
|