本帖最后由 went 于 2021-10-4 18:34 编辑
将 URL 按楼上的格式逐行保存为 a.txt,并与脚本放在同一目录下。
#变量设置
$cur_file = 'pass.txt' #保存已读取的个数
$url_file = 'a.txt' #URL列表文件
$out_file = 'info.txt' #输出信息文件

#&cls&@cd /d "%~dp0" & powershell -c "Get-Content '%~0' | Select-Object -Skip 1 | Out-String | Invoke-Expression" &pause&exit
# Start from an empty console window.
Clear-Host

# One WinHTTP COM client for the whole script; the functions below issue
# every page request through this single $whr object.
$whr = New-Object -ComObject 'WinHttp.WinHttpRequest.5.1'
-
# Helper: download $url through the script-level $whr WinHTTP client and
# return the response body as text.
function Get-PageText($url){
    $whr.Open('GET',$url,$false)   # third argument $false: synchronous request
    $whr.Send()
    $body = $whr.ResponseText
    $whr.Abort()
    return $body
}

# Function: collect the main information for one TV series.
#
# Parameters:
#   $key - numeric series id, substituted into the www.km.com page URLs.
#
# Output:
#   Writes text lines to the pipeline (name, poster URL, overview, score,
#   behind-the-scenes info, cast list, role introductions, then the episode
#   plots produced by Get-PartInfo) followed by a dashed separator line.
#   Also prints a progress line to the console with Write-Host.
#
# Each section is emitted only when its regex matches the downloaded page;
# a missing section is silently skipped.
function Get-MainInfo($key){
    # Main info page
    $txt = Get-PageText ('https://www.km.com/tv/{0}.html' -f $key)

    # Title
    $pattern = '(?s)<div class="inside-title.*?>.*?>(.*?)<'
    if($txt -match $pattern){
        '名称: TTT' + $Matches[1]
        Write-Host ('解析电视剧: ' + $Matches[1])
    }

    # Poster image address (the captured src is prefixed with "https:")
    $pattern = '(?s)<div class="video_poster.*?">.*?<img src="(.*?)"'
    if($txt -match $pattern){
        '图片地址: https:' + $Matches[1]
    }

    # Overview
    "`r`n【概览】"
    $pattern = '(?s)mr20">(.*?)<a class="unfold"'
    if($txt -match $pattern){
        # Drop inline scripts and tags, then squeeze out runs of whitespace.
        $t = $Matches[1] -replace '(?s)<script>.*?</script>','' -replace '<.*?>','' -replace '\s{2,}',''
        # Put every field label on its own line, then remove the cast line
        # (the cast is listed in its own section further down).
        $t -replace '(导演:)|(主演:)|(地区/类型:)|(剧情集数:)|(播出时间:)|(在线观看网站:)|(别名:)|(片长:)',("`r`n" + '$0') -replace "主演:.*?`r`n",''
    }

    # Score
    $pattern = '(?s)<p class="score_num.*?>.*?>(.*?)<'
    if($txt -match $pattern){
        '综合评分: ' + $Matches[1]
    }

    # Plot summary text and total number of episodes.
    # Both are initialised explicitly so that, when a pattern does not match,
    # no value is picked up from an enclosing scope and Get-PartInfo simply
    # iterates zero episodes.
    $plot_main = ''
    $part = 0
    $pattern = '剧情介绍:.*?>(.*?)<'
    if($txt -match $pattern){
        $plot_main = $Matches[1]
    }
    $pattern = '>\(全部 (\d+)<'
    if($txt -match $pattern){
        $part = $Matches[1]
    }

    # Behind-the-scenes information
    "`r`n【幕后信息】"
    $pattern = '(?s)幕后信息</div>(.*?)<a class="intro_fold'
    if($txt -match $pattern){
        $Matches[1] -replace '<.*?>','' -replace '\s{2,}','' -replace '(编剧:)|(制片人:)|(TV首播时间:)|(在线播放平台:)|(出品公司:)|《',("`r`n" + '$0')
    }

    # Cast list (separate page)
    "`r`n【演员表】"
    $txt = Get-PageText ('https://www.km.com/tv/yanyuan/{0}.html' -f $key)
    # NOTE(review): each match runs from one "actor-list-detail same_col"
    # opening tag up to and including the next one, so consecutive matches
    # cannot overlap. Depending on the page layout this may skip every other
    # block and the last one - confirm against the live page.
    $pattern = '(?s)<div class="actor-list-detail same_col">.*?<div class="actor-list-detail same_col">'
    [regex]::Matches($txt,$pattern) | foreach {
        # Only blocks that name a played role are listed.
        if($_.Value.Contains('<em>饰</em><em>演</em>')){
            'ZZZ' + $_.Value -replace '<.*?>|\s{2,}','' -replace '饰演'," 饰演 " -replace '最近作品:.*$',''
        }
    }

    # Role introductions (separate page)
    "`r`n【人物介绍】"
    $txt = Get-PageText ('https://www.km.com/tv/role/{0}.html' -f $key)
    $pattern = '(?s)<div class="role-name">.*?<div class="role-intro-js">'
    [regex]::Matches($txt,$pattern) | foreach {
        $_.Value -replace '<.*?>','' -replace '\s{2,}','' -replace '演员',' 演员' -replace '-->简介:',"-->简介:`r`n`t"
    }

    # Episode-by-episode plots
    Get-PartInfo -key $key -title $plot_main -max $part
    '-----------------------------------------------------------------------------'
}
-
# Function: emit the per-episode plot section for one series.
#
# Parameters:
#   $key   - series id used to build each episode page URL.
#   $title - overall plot summary, printed once before the episodes.
#   $max   - number of episodes; pages 1..$max are requested in order.
#
# Output:
#   Writes the section header, the summary, and for every episode a heading
#   line plus (when the article block is found) its tag-stripped text.
function Get-PartInfo($key,$title,$max){
    "`r`n【分集剧情】"
    "`t剧情介绍`n`t`t" + $title

    # The article pattern is the same for every episode page.
    $articlePattern = '(?s)<div class="article-content">.*?</div>'

    $episode = 1
    while($episode -le $max){
        "`t第{0}集`t`t" -f $episode

        # Fetch the episode page through the shared $whr client.
        $pageUrl = 'https://www.km.com/tv/{0}/2_{1}.html' -f $key,$episode
        $whr.Open('GET',$pageUrl,$false)
        $whr.Send()
        $html = $whr.ResponseText
        $whr.Abort()

        # Strip tags, collapse whitespace runs to one space, trim the ends.
        if($html -match $articlePattern){
            $plain = $Matches[0] -replace '<.*?>','' -replace '\s{2,}',' '
            "`t`t" + $plain.Trim()
        }

        $episode++
    }
}
-
# Settings
$cur_file = 'pass.txt'  # progress file: number of lines of $url_file already consumed
$url_file = 'a.txt'     # input: list of series URLs, one per line
$out_file = 'info.txt'  # output: collected information is appended here

# Load the resume offset from the progress file, if there is one.
# Test-Path resolves the relative name against the PowerShell location, the
# same way Get-Content does. Only a line made of digits is accepted, so a
# damaged progress file falls back to 0 instead of failing the [int] cast.
$cur = 0
if(Test-Path -LiteralPath $cur_file -PathType Leaf){
    $line = Get-Content -LiteralPath $cur_file | Select-Object -First 1
    if($line -match '^\s*(\d+)\s*$'){
        $cur = [int]$Matches[1]
    }
}
Write-Host ('上次解析个数: {0}' -f $cur)

# Read the URL list, skipping the lines handled by a previous run.
Get-Content -LiteralPath $url_file | Select-Object -Skip $cur | foreach {
    # Dots are escaped so that only the literal host and ".html" match.
    if($_ -match 'https://www\.km\.com/tv/(\d+)\.html'){
        # Scrape this series and append the result to the output file.
        Get-MainInfo -key $Matches[1] | Out-File $out_file -Append
    }
    # Save progress after EVERY consumed line, matching or not. The counter
    # is used as a line offset for -Skip above, so counting only matching
    # lines would make a resumed run re-process finished URLs whenever the
    # list contains blank or non-URL lines.
    ++$cur
    Out-File -InputObject $cur -FilePath $cur_file
}
复制代码
|