Based on a simple tally of where the articles and tutorials recommended in each issue of PythonWeekly are hosted, we can guess which sites abroad are the most active places for sharing Python content, and roughly compare how the venues where good Python blog posts and content get published have changed.

The "old" sample covers roughly 2012-2013; the "new" one is from this year (2017). Format: [(site root domain, cumulative count), ...]
(The old and new tallies cover different numbers of issues: new = the most recent 20 issues, old = the first 81 issues. Only the article and tutorial links are counted.)

As you can see, GitHub and YouTube have been active sharing sites all along. WordPress's share has shrunk, Blogspot is barely visible any more, and Medium is now the one riding high. The landscape has changed quite a bit.
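For reference, the tally itself boils down to tldextract plus collections.Counter; below is a minimal sketch of that step, using the same approach as the snippet at the end of both scripts further down (the issue data here is a made-up sample):

```python
# Minimal sketch of the domain tally. Each issue is assumed to be stored as
# [issue_number, issue_url, [[article_title, article_url], ...]],
# which is the shape the scripts below build; the sample data is hypothetical.
from collections import Counter

import tldextract

issues = [
    ['101', 'http://www.pythonweekly.com/archive/101.html',
     [['A hypothetical article', 'https://medium.com/@someone/a-hypothetical-article']]],
]

host_list = [
    tldextract.extract(article_url).domain   # root domain only, e.g. 'medium'
    for *_, articles in issues
    for _title, article_url in articles
]
print(Counter(host_list).most_common(20))    # -> [('medium', 1)]
```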
- PythonWeekly's content is pretty good, but sometimes I was too lazy to read it and missed quite a few issues, and I couldn't be bothered to dig through the mailing list one email at a time. So I wrote a crawler that scrapes only the recommended articles/tutorials section of each issue and produces a markdown list that can be skimmed quickly, which makes it easy to pick out the articles I'm interested in, and is also handy to search once opened in a browser. Scanning the English titles was still too slow, though, so I signed up for a Baidu Translate API key and machine-translated them; the results aren't as good as Google Translate, but they're readable enough, and Chinese is simply more comfortable to skim. Click the links below to see the compiled lists. (Jianshu couldn't hold them, so they're hosted on GitHub; the rendering is about the same.)
Summary of recent articles
Summary of early articles
- The only issues I could find are the first 81 (handled by `initNews.py`) and the most recent 20 (handled by `recentNews.py`); PythonWeekly publishes one issue per week. To run the code, substitute your own Baidu Translate API keys.
- `initNews.py` and `recentNews.py` are essentially the same. The latter is single-threaded and therefore slower, but it fetches so little that it hardly takes any time anyway; the former uses a thread pool, and the speedup is obvious: even with four times the volume it finishes almost instantly. (Also, the Baidu Translate API can translate several sentences in a single request if you separate them with '\n' in the query; see the sketch right after this list.)
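Here is a minimal sketch of that '\n' batching trick, trimmed from the `baidu_translates` helper in `initNews.py` below; the appid/secretKey values are placeholders and the example titles are made up:

```python
# Translate several titles with a single Baidu Translate request by joining them with '\n'.
# appid/secretKey are placeholders; the signing scheme follows the official API docs
# (http://api.fanyi.baidu.com/api/trans/product/apidoc).
import random
from hashlib import md5

import requests

appid = 'yourappid'
secretKey = 'yoursecretkey'


def baidu_translates(query, from_lang='en', to_lang='zh'):
    salt = str(random.randint(32768, 65536))
    sign = md5((appid + query + salt + secretKey).encode('utf-8')).hexdigest()
    params = {'appid': appid, 'q': query, 'from': from_lang, 'to': to_lang,
              'salt': salt, 'sign': sign}
    res = requests.get('http://api.fanyi.baidu.com/api/trans/vip/translate', params=params)
    # One '\n'-separated input line per sentence, one translated entry back for each
    return [item['dst'] for item in res.json()['trans_result']]


titles = ['How to profile Python code', 'Understanding asyncio', 'Testing with pytest']
print(baidu_translates('\n'.join(titles)))  # a single API call translates all three titles
```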
Code

`initNews.py`
```python
import requests
from bs4 import BeautifulSoup
import re

# Replace these with your own Baidu Translate API credentials
appid = 'yourappid'
secretKey = 'yoursecretkey'

from fake_useragent import UserAgent
ua = UserAgent()
headers = {'user-agent': ua.chrome}

pythonweekly_init_issues_archive_url = (
    'http://www.pythonweekly.com/archive/')


def get_pythonweekly_init_issues_urls():
    # Scrape [issue_number, issue_url] pairs from the archive page
    url = pythonweekly_init_issues_archive_url
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, 'lxml')
    return [[
        a.text.split(' ')[-1].strip(),
        ''.join([url, a['href']]),
    ] for a in soup.select('li a')]


pythonweekly_init_issues_urls = get_pythonweekly_init_issues_urls()


def get_single_issue_info(issue):
    # Parse one issue page and append its [title, link] article list to `issue`
    try:
        # issue = [text, url, list]
        url = issue[1]
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.content, 'lxml')
        content = soup.select_one('td .defaultText')
        try:
            # Locate the "Articles ..." heading and the heading that follows it
            submenus = [i.text for i in content.find_all('strong')]
            for index, menu in enumerate(submenus):
                if re.search('[Aa]rticles', menu):
                    break
            start_text = [menu, ]
            end_text = submenus[index + 1]
        except:
            # dirty hack: fall back to the known heading variants
            start_text = ['Articles,\xa0Tutorials and Talks',
                          '\xa0Tutorials and Talks',  # for 11.html and 12.html
                          'Articles Tutorials and Talks']
            end_text = 'Interesting Projects, Tools and Libraries'
        flag = 0
        list_ = []
        # Collect every linked span between the start heading and the end heading
        for s in content.find_all('span'):
            if not flag:
                if s.text not in start_text:
                    continue
                else:
                    flag = 1
                    continue
            if s.text == end_text:
                break
            try:
                one = [s.text.strip(), s.find('a')['href']]
                # print(one)
                list_.append(one)
            except TypeError:
                pass
        # return list_
        issue.append(list_)
        print('下載完成', issue[0])
    except Exception as e:
        print('wrong: ', issue[0], '\n', e)


# Fetch the issues concurrently (multiprocessing.dummy uses threads)
from multiprocessing.dummy import Pool
pool = Pool(30)
pool.map(get_single_issue_info, pythonweekly_init_issues_urls)
pythonweekly_init_issues = pythonweekly_init_issues_urls


def baidu_translates(query):
    '''
    http://api.fanyi.baidu.com/api/trans/product/apidoc
    '''
    from hashlib import md5
    import random
    url = 'http://api.fanyi.baidu.com/api/trans/vip/translate'
    fromLang = 'en'
    toLang = 'zh'
    salt = random.randint(32768, 65536)
    sign = appid + query + str(salt) + secretKey
    m1 = md5()
    m1.update(sign.encode('utf-8'))
    sign = m1.hexdigest()
    params = {'appid': appid,
              'q': query,
              'from': fromLang,
              'to': toLang,
              'salt': str(salt),
              'sign': sign, }
    res = requests.get(url, params=params)
    return res.json()['trans_result']


def get_translate(issue):
    # Translate all article titles of one issue in a single request
    articles = issue[-1]
    try:
        result = baidu_translates('\n'.join([i[0] for i in articles]))
        for index, i in enumerate(articles):
            i.append(result[index]['dst'])
        print('翻譯完成', issue[0])
    except:
        print('**翻譯失敗**', issue[0])


pool.map(get_translate, pythonweekly_init_issues)

from jinja2 import Template

table = """
<table>
{% for issue_num, issue_href, article_lists in issues %}
{% for article_name, article_href, article_chinese in article_lists %}
<tr>
<td><a href='{{issue_href}}'>{{ issue_num }}</a></td>
<td><a href='{{article_href}}'>{{ article_name }}</a></td>
<td><a href='{{article_href}}'>{{ article_chinese }}</a></td>
</tr>
{% endfor %}
{% endfor %}
</table>
"""
template = Template(table)
t = template.render(issues=pythonweekly_init_issues)

import time
with open('pythonweekly_init ' + time.ctime().replace(':', '_') + '.html', 'w', encoding='utf-8') as f:
    f.write(t)

pool.close()
pool.join()

# https://stackoverflow.com/questions/9626535/get-domain-name-from-url
# get_host = requests.urllib3.util.url.get_host  # get_host(i[1])[1]
import tldextract
host_list = [
    tldextract.extract(i[1]).domain
    for *_, articles in pythonweekly_init_issues for i in articles]

from collections import Counter
counter = Counter(host_list)
print(counter.most_common(20))

with open('pythonweekly_init.md', 'w', encoding='utf-8') as f:
    f.write(u'### PythonWeekly初期文章教程匯總\n')
    f.write(u'| 期號 | 英文名 | 中文名|\n')
    f.write(u'| ------------- |:-------------:| -----:|\n')
    for issue_num, issue_href, article_lists in pythonweekly_init_issues:
        for article_name, article_href, article_chinese in article_lists:
            f.write(('| [{issue_num}]({issue_href}) '
                     '| [{article_name}]({article_href}) '
                     '| [{article_chinese}]({article_href}) '
                     '| \n').format(**locals()))
```
`recentNews.py`
```python
import requests
from bs4 import BeautifulSoup
import re

# Replace these with your own Baidu Translate API credentials
appid = 'yourappid'
secretKey = 'yoursecretkey'

from fake_useragent import UserAgent
ua = UserAgent()
headers = {'user-agent': ua.chrome}

pythonweekly_recent_issues_archive_url = (
    'http://us2.campaign-archive2.com/home/'
    '?u=e2e180baf855ac797ef407fc7&id=9e26887fc5')


def get_pythonweekly_recent_issues_urls():
    # Scrape [issue_number, issue_url] pairs from the mailing-list archive page
    res = requests.get(pythonweekly_recent_issues_archive_url, headers=headers)
    soup = BeautifulSoup(res.content, 'lxml')
    return [[
        a.text.split(' ')[-1].strip(),
        a['href'],
    ] for a in soup.select('li a')]


pythonweekly_recent_issues_urls = get_pythonweekly_recent_issues_urls()


def get_single_issue_info(url):
    # Parse one issue page and return its [title, link] article list
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, 'lxml')
    content = soup.select_one('td .defaultText')
    # Section headings are the dark-red spans in the recent issues
    submenus = [i.text for i in content.find_all('span', attrs={'style': "color:#B22222"})]
    for index, i in enumerate(submenus):
        if re.search('[Aa]rticles', i):
            break
    start_text = i
    end_text = submenus[index + 1]
    flag = 0
    list_ = []
    for s in content.find_all('span'):
        if not flag:
            if s.text != start_text:
                continue
            else:
                flag = 1
                continue
        if s.text == end_text:
            break
        try:
            one = [s.text.strip(), s.find('a')['href']]
            # print(one)
            list_.append(one)
        except TypeError:
            pass
    return list_


for i in pythonweekly_recent_issues_urls:
    # [text, url, list]
    print(i[0])
    i.append(get_single_issue_info(i[1]))
pythonweekly_recent_issues = pythonweekly_recent_issues_urls


def baidu_translate(query):
    '''
    http://api.fanyi.baidu.com/api/trans/product/apidoc
    '''
    from hashlib import md5
    import random
    url = 'http://api.fanyi.baidu.com/api/trans/vip/translate'
    fromLang = 'en'
    toLang = 'zh'
    salt = random.randint(32768, 65536)
    sign = appid + query + str(salt) + secretKey
    m1 = md5()
    m1.update(sign.encode('utf-8'))
    sign = m1.hexdigest()
    params = {'appid': appid,
              'q': query,
              'from': fromLang,
              'to': toLang,
              'salt': str(salt),
              'sign': sign, }
    res = requests.get(url, params=params)
    return res.json()['trans_result'][0]['dst']


# Translate titles one by one (single-threaded, one request per title)
for *_, articles in pythonweekly_recent_issues:
    for i in articles:
        i.append(baidu_translate(i[0]))
    print('done')

from jinja2 import Template

table = """
<table>
{% for issue_num, issue_href, article_lists in issues %}
{% for article_name, article_href, article_chinese in article_lists %}
<tr>
<td><a href='{{issue_href}}'>{{ issue_num }}</a></td>
<td><a href='{{article_href}}'>{{ article_name }}</a></td>
<td><a href='{{article_href}}'>{{ article_chinese }}</a></td>
</tr>
{% endfor %}
{% endfor %}
</table>
"""
template = Template(table)
t = template.render(issues=pythonweekly_recent_issues)

import time
with open('pythonweekly_recent ' + time.ctime().replace(':', '_') + '.html', 'w', encoding='utf-8') as f:
    f.write(t)

import tldextract
host_list = [
    tldextract.extract(i[1]).domain
    for *_, articles in pythonweekly_recent_issues for i in articles]

from collections import Counter
counter = Counter(host_list)
print(counter.most_common(20))

with open('pythonweekly_recent.md', 'w', encoding='utf-8') as f:
    f.write(u'### PythonWeekly文章教程近期匯總\n')
    f.write(u'| 期號 | 英文名 | 中文名|\n')
    f.write(u'| ------------- |:-------------:| -----:|\n')
    for issue_num, issue_href, article_lists in pythonweekly_recent_issues:
        for article_name, article_href, article_chinese in article_lists:
            f.write(('| [{issue_num}]({issue_href}) '
                     '| [{article_name}]({article_href}) '
                     '| [{article_chinese}]({article_href}) '
                     '| \n').format(**locals()))
```