本帖最后由 zaqmlp 于 2021-10-11 16:21 编辑
- <# :
- rem www.tvmao.com
- rem Save this file as an ANSI-encoded bat; the launcher line below decodes it as GB2312.
- rem Everything from the first line down to the closing marker sits inside a PowerShell
- rem block comment, so PowerShell skips these batch commands when the file is evaluated.
- cls&echo off&cd /d "%~dp0"
- rem Read this very file as text and pipe it to Invoke-Expression to run the PowerShell part.
- powershell -NoProfile -ExecutionPolicy bypass "[IO.File]::ReadAllText('%~f0',[Text.Encoding]::GetEncoding('GB2312'))|Invoke-Expression"
- rem Wait for a key press, then leave cmd before it reaches the PowerShell code below.
- pause
- exit
- #>
-
- # 1-based number of the first input line to process; earlier lines are skipped.
- $starturl = 1
- # An episode text is accepted only when it is longer than this many characters.
- $minilen = 160
- # Seconds to pause after three too-short results in a row before trying again.
- $timeout = 900
- # Input list (one entry per line) and the file the results are appended to.
- $infile = '.\文本b.txt'
- $outfile = '.\结果.txt'
- # Nothing can be done without the input list, so report it and stop.
- if (!(Test-Path -LiteralPath $infile)) {
-     Write-Host ('"' + $infile + '" 未找到')
-     exit
- }
- # UTF-8 is used for the downloaded pages and for both text files.
- $enc = [System.Text.Encoding]::UTF8
- # A single WebClient instance is shared by every download.
- $webclient = New-Object System.Net.WebClient
- # Downloads the page at $u with the shared $webclient and returns its source decoded with $enc.
- # Up to three attempts are made; an empty string is returned when all of them fail.
- function gethtml($u){
-     $t='';
-     $maxtry=3;
-     for($j=1;$j -le $maxtry;$j++){
-         try{
-             # The header is applied on every attempt, as the original did.
-             # Set replaces a value that is already present, whereas Add would append a second one.
-             $webclient.Headers.Set('User-Agent','Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36');
-             $webclient.Encoding=$enc;
-             $t=$webclient.DownloadString($u);
-             break;
-         }catch{
-             # Include the reason, otherwise a malformed URL and a network error look the same.
-             write-host ('第'+$j.toString()+'次获取网页源码失败: '+$_.Exception.Message);
-             # Waiting is only useful when another attempt follows.
-             if($j -lt $maxtry){start-sleep -Seconds 3;}
-         };
-     };
-     return $t;
- };
-
- # Fetches the page at $u via gethtml and returns the plain text found inside its
- # <article class="clear epi_c"> element, or an empty string when that element is missing.
- function getcontent($u){
-     $content='';
-     $html2=gethtml $u;
-     $m3=[regex]::match($html2,'<article class="clear epi_c"[^>]*?>([\s\S]+?)</article>');
-     if($m3.Success){
-         $content=$m3.groups[1].value -replace '<p>','';
-         # Turn non-breaking spaces (the &nbsp; entity or the raw U+00A0 character) into ordinary
-         # spaces; the previous space-for-space replacement changed nothing.
-         $content=$content -replace '&nbsp;|\u00A0',' ';
-         # Every closing </p> ends a paragraph, so it becomes a line break.
-         $content=$content -replace '</p>',"`r`n";
-         # Drop embedded <div> blocks together with their contents.
-         $content=$content -replace '<div(?: [^>]*?)?>[\s\S]+?</div>','';
-         # Strip the remaining complete tags, then any tag fragment left without a closing '>'.
-         $content=(($content -replace '<[^>]+?>','') -replace '</?[a-z]+','').trim();
-     }
-     return $content;
- }
-
- # Main loop. For every input line from line number $starturl on:
- #   1. download "<line>/episode" and take the title (prefixed with the literal marker 'TTT');
- #   2. collect the episode links from the <div class="epipage clear"> pager;
- #   3. fetch each episode text with getcontent and append title + episodes to $outfile.
- $fs=New-Object System.IO.FileStream($outfile, [System.IO.FileMode]::Append);
- $sw=New-Object System.IO.StreamWriter($fs, $enc);
- try{
-     # Split on CR and LF as individual characters (empty pieces dropped), so CRLF and
-     # LF-only files both work; the explicit char[] cast avoids relying on implicit conversion.
-     $text=[IO.File]::ReadAllText($infile, $enc).Split([char[]]"`r`n",[StringSplitOptions]::RemoveEmptyEntries);
-     for($i=0;$i -lt $text.Count;$i++){
-         # Lines before the configured 1-based start line are skipped.
-         if(($i+1) -lt $starturl){continue;}
-         write-host ('--------------'+$text[$i]+'--------------');
-         $result=New-Object -TypeName System.Collections.ArrayList;
-         # Remove trailing spaces and slashes before appending the sub-path.
-         $url=$text[$i].trimend(' /');
-         $url1=$url+'/episode';
-         $html1=gethtml $url1;
-         $title='TTT';
-         $m1=[regex]::match($html1,'title="([^"]+?)剧情介绍".*?>介绍');
-         if($m1.Success){$title=$title+$m1.groups[1].value.replace('剧情介绍','');}
-         write-host $title;
-         [void]$result.add($title);
-         $m1=[regex]::match($html1,'<div class="epipage clear">([\s\S]+?)</div>');
-         if($m1.Success){
-             # Scheme and host of the input URL, computed once per input line. The trailing '.*'
-             # (instead of '.+') keeps the host intact when the URL has no path at all.
-             $site=$url -replace '^(https?://[^/]+).*','$1';
-             $m2=[regex]::matches($m1.groups[1].value, '<li(?: [^>]*?)?>.*?href="([^"]+?)"[^>]*?>(\d+)</a></li>');
-             for($j=0;$j -lt $m2.Count;$j++){
-                 [void]$result.add('第'+$m2[$j].groups[2].value+'集');
-                 # The captured href is appended to the site root as-is.
-                 $url2=$site+$m2[$j].groups[1].value;
-                 write-host ($m2[$j].groups[2].value+' '+$url2);
-                 # Retry until the text is longer than $minilen characters. There is no upper
-                 # limit on attempts; after every third short result the script pauses for
-                 # $timeout seconds before continuing.
-                 $t=0;
-                 while(1){
-                     $ct=getcontent $url2;
-                     write-host $ct.length;
-                     if($ct.length -gt $minilen){
-                         [void]$result.add($ct);
-                         break;
-                     }else{
-                         $t++;
-                     }
-                     if($t -ge 3){
-                         $t=0;
-                         write-host ('字符数少于'+$minilen+',暂停'+$timeout+'秒……');
-                         start-sleep -Seconds $timeout;
-                     }
-                 }
-             }
-         }
-         $s=$result -join "`r`n";
-         $sw.WriteLine($s);
-         $sw.WriteLine('');
-         # Flush after each input line so finished results reach the file right away.
-         $sw.Flush();
-     }
- }finally{
-     # Close the writer and the stream even when the loop fails or is interrupted.
-     $sw.Close();
-     $fs.Close();
- }
复制代码
|