本帖最后由 flashercs 于 2018-12-18 06:36 编辑
回复 70# lxh623
你表面是提取txt文本,实际上是用来当作html使用,对吧?这样每个子文件夹提取为一个txt文件,其实是个没有<head><body>的html文本,例如提取出的"2018-12-17.txt",将其重命名为"2018-12-17.txt.html"后用浏览器打开,就是个完整的html文档,浏览器自动添加<html><head><body>;我猜这应该是你的需求!
请保存为 "提取zgshige.bat":
- @echo off
- REM Root directory that holds the downloaded zgshige HTML files; edit as needed.
- set "htmlRoot=M:\zgshige"
- REM Everything below the marker line of hash characters is the embedded
- REM PowerShell script. Find the marker line number and copy the remainder of
- REM this file into a .ps1 stored next to the batch file.
- REM   /b anchors the search at line start, so this FOR line cannot match itself.
- REM   "%~f0" is the quoted full path of this file, so paths containing spaces
- REM   (or a launch without the .bat extension) still work.
- for /f "tokens=1 delims=:" %%A in ('findstr /n /b /c:"################" "%~f0"') do more +%%A "%~f0" >"%~dpn0.ps1"
- REM -NoProfile keeps user profile scripts from interfering with the run.
- powershell.exe -NoProfile -ExecutionPolicy Bypass -File "%~dpn0.ps1" "%htmlRoot%"
- pause
- exit /b
- ################################################################
- # Embedded PowerShell script: the batch launcher copies every line after the
- # marker line above into a .ps1 file and runs it with the HTML root as argument.
- #
- # For each sub-directory below $htmlRoot, every *.html file is scanned for the
- # article fragment. Title, author, signature and content nodes are regrouped
- # into one <div>, and its outerHTML (markup is kept on purpose) is appended to
- # "<sub-directory name>.txt" in the directory of this script.
- #
- # Parameters:
- #   $htmlRoot - root directory of the HTML files; only its sub-directories
- #               (recursively) are processed.
- param([string]$htmlRoot)
- [string]$scriptPath = [System.IO.Path]::GetDirectoryName($MyInvocation.MyCommand.Definition)
- # Fail early with a clear message instead of an exception from GetDirectories.
- if (-not [System.IO.Directory]::Exists($htmlRoot)) {
-     Write-Error "htmlRoot directory not found: $htmlRoot"
-     exit 1
- }
- # Scratch MSHTML document; its <body> is reused to parse each extracted fragment.
- $htmldoc = New-Object -ComObject htmlfile
- [void]$htmldoc.IHTMLDocument2_open()
- $htmldoc.IHTMLDocument2_write('<!DOCTYPE html><html><head><meta charset="utf-8" /><meta http-equiv="X-UA-Compatible" content="IE=edge"><title>Page Title</title><meta name="viewport" content="width=device-width, initial-scale=1"></head><body></body></html>')
- $htmldoc.IHTMLDocument2_close()
- # Lazy match from the article header div up to (not including) the "p-sm" div.
- $fragmentPattern = '<div class="text-center b-b b-2x b-lt">[\S\s]+?(?=<div class="p-sm">)'
- try {
-     foreach ($dir in [System.IO.Directory]::GetDirectories($htmlRoot, '*', [System.IO.SearchOption]::AllDirectories)) {
-         # One output file per sub-directory, opened in append mode as UTF-8.
-         $outFile = [System.IO.Path]::Combine($scriptPath, [System.IO.Path]::GetFileName($dir) + '.txt')
-         $sw = [System.IO.StreamWriter]::new($outFile, $true, [System.Text.Encoding]::UTF8)
-         try {
-             $sw.AutoFlush = $true
-             foreach ($file in [System.IO.Directory]::GetFiles($dir, '*.html')) {
-                 try {
-                     # Read inside the try block: a failed read must skip this file
-                     # rather than fall through with the previous file's match data.
-                     $html = [System.IO.File]::ReadAllText($file, [System.Text.Encoding]::UTF8)
-                     if ($html -match $fragmentPattern) {
-                         Write-Host "提取$file" -ForegroundColor Green
-                         $htmldoc.body.innerHTML = $Matches[0]
-                         $div = $htmldoc.createElement('div')
-                         # title
-                         [void]$div.appendChild($htmldoc.body.getElementsByTagName('h3')[0])
-                         # author: appendChild moves the node out of the body, so asking
-                         # for children[0] a second time yields the next sibling.
-                         $divAuthor = $htmldoc.createElement('div')
-                         [void]$divAuthor.appendChild($htmldoc.body.children[1].children[0].children[0])
-                         $span = $divAuthor.appendChild($htmldoc.body.children[1].children[0].children[0])
-                         [void]$span.removeAttributeNode($span.getAttributeNode('class'))
-                         [void]$div.appendChild($divAuthor)
-                         # signature: keep only the text of the optional signature nodes
-                         $divSignature = $htmldoc.createElement('div')
-                         $nodeSig = $htmldoc.body.getElementsByClassName('signature')[0]
-                         if ($null -ne $nodeSig) {
-                             $span = $htmldoc.createElement('span')
-                             [void]$span.appendChild($htmldoc.createTextNode($nodeSig.textContent))
-                             [void]$divSignature.appendChild($span)
-                         }
-                         $nodeSigbox = $htmldoc.body.getElementsByClassName('signatureBox')[0]
-                         if ($null -ne $nodeSigbox) {
-                             $span = $htmldoc.createElement('span')
-                             [void]$span.appendChild($htmldoc.createTextNode($nodeSigbox.textContent))
-                             [void]$divSignature.appendChild($span)
-                         }
-                         [void]$div.appendChild($divSignature)
-                         # content: drop the styling class, keep the markup
-                         $divContent = $htmldoc.body.getElementsByClassName('m-lg font14')[0]
-                         [void]$divContent.attributes.removeNamedItem('class')
-                         [void]$div.appendChild($divContent)
-                         $sw.WriteLine($div.outerHTML)
-                     }
-                 }
-                 catch {
-                     # Best effort: a page with an unexpected layout is skipped, but
-                     # the failure is reported instead of being swallowed silently.
-                     Write-Warning "Skipped ${file}: $($_.Exception.Message)"
-                 }
-             }
-         }
-         finally {
-             # Always release the output file handle, even when an error escapes.
-             $sw.Close()
-         }
-     }
- }
- finally {
-     # Remove-Variable alone does not free the COM object; release it explicitly.
-     [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($htmldoc)
-     Remove-Variable -Name htmldoc
- }
复制代码
|