"""
Collect thread URLs from the 55188.com forum.

Original script name: python采集55188.com论坛URL.py
http://bbs.bathome.net/thread-40289-1-1.html

2016-05-02 17:28:47 codegay

Reference: Python 3 study notes (using the urllib module)
http://www.cnblogs.com/Lands-ljk/p/5447127.html
"""

import re
from time import sleep
from urllib import request

rooturl = "http://www.55188.com/"
forumurl = "http://www.55188.com/forum-111-{}.html"
tidurl = "http://www.55188.com/viewthread.php?tid={}\n"

# Compiled once at import time instead of once per downloaded page.
# Raw string: "\?" and "\d" are regex escapes, not Python string escapes.
rec = re.compile(r'''viewthread.php\?tid=(\d+).+#anchorlink''')


def extract_tids(html):
    """Return the distinct thread ids found in *html*.

    A thread id is the number following ``viewthread.php?tid=`` on a line
    that also contains ``#anchorlink`` later on. Ids are returned as
    strings, de-duplicated and sorted numerically so the output order is
    deterministic.
    """
    return sorted(set(rec.findall(html)), key=int)


def main():
    """Download forum index pages 1..209 and write thread URLs to result.txt.

    Each distinct thread id is written once as a full ``viewthread.php``
    URL, one per line. Network errors from ``urlopen`` propagate to the
    caller; URLs written before the failure remain in the file.
    """
    print("程序运行中...")

    # Ids already written, so a thread listed on several index pages
    # is only emitted once.
    seen = set()

    with open("result.txt", "w", encoding="utf-8") as out:
        for page in range(1, 210):
            # The context manager closes the HTTP response promptly.
            with request.urlopen(forumurl.format(page)) as resp:
                # Only ASCII ids are extracted, so a stray undecodable
                # byte should not abort the whole crawl.
                html = resp.read().decode("gbk", errors="replace")

            fresh = [tid for tid in extract_tids(html) if tid not in seen]
            seen.update(fresh)
            out.writelines(tidurl.format(tid) for tid in fresh)
            # To throttle requests, call sleep(1) here.

    input("运行结束,回车退出")


if __name__ == "__main__":
    main()
# End of script (trailing "copy code" button text from the forum page removed).