<#*,:&cls
@echo off
:: Author's note, translated: efficiency is unverified, about 30000 lines took 17 seconds.
::
:: Polyglot launcher. PowerShell treats everything from the first line down to
:: the closing comment marker below as one block comment, while cmd.exe runs
:: these batch lines and leaves through "exit /b" before it reaches any
:: PowerShell code.
::
:: Work from the script's own directory so relative data file names resolve there.
pushd "%~dp0"
:: Re-read this very file and dot-source it as a PowerShell script block.
:: The fully qualified form of argument 0 is used instead of the name as typed:
:: the typed name may lack its extension or be relative to the directory the
:: caller was in before pushd, and Get-Content would then fail to find it.
Powershell -NoProfile -ExecutionPolicy RemoteSigned -Command ". ([ScriptBlock]::Create((Get-Content -LiteralPath \"%~f0\" -ReadCount 0 | Out-String ))) "
popd
pause
exit /b
#>
# Text file whose lines are analysed (relative path).
$FileList = '源文件.txt'
# Report file that receives the "substring : count" lines (relative path).
$FileOut = '词频统计.txt'
function Get-WordCount {
    <#
    .SYNOPSIS
    Counts every fixed-length substring found in the supplied sentences.

    .DESCRIPTION
    Each sentence is split into runs of characters in the range U+4E00..U+9FA5
    (or taken whole when -IncludePunctuations is set). For every requested
    length, a window of that length slides over each run one character at a
    time and the substring under the window is counted.

    .PARAMETER Sentences
    Strings to analyse. Accepts pipeline input. Null or empty entries are skipped.

    .PARAMETER WordLengthList
    Substring lengths to count; every value must be greater than 0.
    Duplicate values are ignored. Defaults to 2 through 7.

    .PARAMETER IncludePunctuations
    Treat each sentence as one run instead of extracting only the
    U+4E00..U+9FA5 character runs, so windows may contain any character.

    .OUTPUTS
    A hashtable keyed by substring length. Each value is a
    Dictionary[string, int] mapping substring to number of occurrences.
    #>
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true, Position = 0, ValueFromPipeline = $true, ValueFromPipelineByPropertyName = $true)]
        [AllowEmptyCollection()]
        [AllowEmptyString()]
        [AllowNull()]
        [string[]]$Sentences,

        [Parameter(Mandatory = $false, Position = 1)]
        [ValidateNotNullOrEmpty()]
        [ValidateScript( { $_ -gt 0 })]
        [int[]]$WordLengthList = @(2, 3, 4, 5, 6, 7),

        [Parameter(Mandatory = $false, Position = 2)]
        [switch]$IncludePunctuations
    )

    begin {
        # One counter dictionary per distinct length. A repeated length would
        # make Hashtable.Add throw, so repeats are dropped here.
        $dicWordCount = @{ }
        $uniqueLengths = @()
        foreach ($length in $WordLengthList) {
            if (-not $dicWordCount.ContainsKey($length)) {
                $dicWordCount.Add($length, (New-Object "System.Collections.Generic.Dictionary[string, int]"))
                $uniqueLengths += $length
            }
        }
        # $reWord = New-Object System.Text.RegularExpressions.Regex -ArgumentList @("\w+")
        $reWord = New-Object System.Text.RegularExpressions.Regex -ArgumentList @("[\u4E00-\u9FA5]+")
    }

    process {
        foreach ($Sentence in $Sentences) {
            # Nothing to count, and Regex.Matches rejects a null input.
            if ([string]::IsNullOrEmpty($Sentence)) {
                continue
            }

            if ($IncludePunctuations) {
                $WordList = @($Sentence)
            } else {
                # Enumerate the match collection directly; a pipeline per
                # sentence is needlessly slow on large inputs.
                $WordList = @(foreach ($match in $reWord.Matches($Sentence)) { $match.Value })
            }

            foreach ($Word in $WordList) {
                foreach ($WordLength in $uniqueLengths) {
                    # Last start index at which a window of this length still fits.
                    $PosTail = $Word.Length - $WordLength
                    if ($PosTail -lt 0) {
                        continue
                    }
                    $dicCtr = $dicWordCount[$WordLength]
                    for ($i = 0; $i -le $PosTail; $i++) {
                        $dicCtr[$Word.Substring($i, $WordLength)]++
                    }
                }
            }
        }
    }

    end {
        $dicWordCount
    }
}
-
# Read the whole input file in one batch and count substrings of every configured length.
$sourceLines = Get-Content -ReadCount 0 -Path $FileList
$dicWordCount = Get-WordCount -Sentences $sourceLines

# Build the report: lengths in ascending order, one "substring : count" line
# per entry, with the substring left-aligned in a 10-character column.
$reportLines = foreach ($length in ($dicWordCount.Keys | Sort-Object)) {
    foreach ($entry in $dicWordCount[$length].GetEnumerator()) {
        '{0,-10} : {1}' -f $entry.Key, $entry.Value
    }
}

Set-Content -LiteralPath $FileOut -Value $reportLines
复制代码
|