本帖最后由 wxyz0001 于 2021-5-24 10:00 编辑
回复 5# netdzb
我最后一篇不知道为什么没有写入,另外乱码不知道怎么解决,有好几种编码
# Forum note (translated from the post): please check whether these two
# problems can be solved.
"""Download every article linked from an index page into one text file.

The script fetches the index page at ``url``, collects the article links
and titles, then appends each article's paragraph text to ``OUTPUT_FILE``.
"""
from urllib.parse import urljoin

import parsel
import requests

SITE_ROOT = 'https://www.dzwzzz.com'
url = 'https://www.dzwzzz.com/jingxuan'
OUTPUT_FILE = '读者文摘.txt'

# CSS selectors that may hold article paragraphs; all are tried, in order.
CONTENT_SELECTORS = (
    '.blkContainerSblkCon p::text',
    '.mt20.f14 p::text',
    '.mt20.f12 p::text',
)

# Strict decoding attempts, in order. UTF-8 rejects almost all non-UTF-8
# input, and GB18030 is a superset of GB2312/GBK, so this pair covers the
# usual encodings of Chinese pages without relying on a statistical guess.
_CANDIDATE_ENCODINGS = ('utf-8-sig', 'gb18030')


def _decode_body(response):
    """Decode the response bytes, trying strict codecs before guessing."""
    raw = response.content
    for codec in _CANDIDATE_ENCODINGS:
        try:
            return raw.decode(codec)
        except UnicodeDecodeError:
            continue
    # Last resort: the detector's guess, replacing undecodable bytes so a
    # partially garbled page is still returned instead of raising.
    guessed = response.apparent_encoding or 'utf-8'
    try:
        return raw.decode(guessed, errors='replace')
    except LookupError:
        return raw.decode('utf-8', errors='replace')


def _absolute_url(href):
    """Turn an index-page href into an absolute article URL."""
    start = href.find('..')
    end = href.find('html')
    if start == -1 or end == -1:
        # The href is not of the '../path/name.html' form; resolve it
        # against the site root instead of raising ValueError.
        return urljoin(SITE_ROOT + '/', href)
    return SITE_ROOT + href[start + 2:end] + 'html'


def getHTMLText(url):
    """Return the decoded body of ``url``.

    On any request failure (connection error, timeout, HTTP error status)
    the sentinel string "产生异常" is returned instead of raising.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        return "产生异常"
    return _decode_body(r)


def get_link_title(url):
    """Return a dict mapping absolute article URL -> link text.

    Each anchor is read as a unit so that an anchor without text cannot
    shift the pairing between links and titles.
    """
    sel = parsel.Selector(getHTMLText(url))
    infoDict = {}
    for anchor in sel.css('.blkContainerSblkCon a'):
        href = anchor.css('::attr(href)').get()
        if not href:
            continue
        title = anchor.css('::text').get() or ''
        infoDict[_absolute_url(href)] = title
    return infoDict


def get_chapter_text(url):
    """Return the article text at ``url``, one paragraph fragment per line.

    Text is gathered from every selector in ``CONTENT_SELECTORS``. If none
    of them matches, a notice is printed and an empty string is returned.
    """
    sel = parsel.Selector(getHTMLText(url))
    lines = []
    for selector in CONTENT_SELECTORS:
        lines.extend(sel.css(selector).getall())
    if not lines:
        print('获取不到div标签')
    return ''.join(line + '\n' for line in lines)


def main():
    """Fetch the index once and append every article to the output file."""
    articles = get_link_title(url)
    length = len(articles)
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        for count, (link, title) in enumerate(articles.items(), start=1):
            print('正在打印第%d篇' % count, '总共有%d篇' % length)
            f.write('##' + title + '\n' + get_chapter_text(link) + '\n')


if __name__ == '__main__':
    main()
|