解析 Facebook 开源项目排名

首页

index

分析页面元素

css1

针对标黄的标签进行解析即可

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def _parse_star_link(elem, mark='stargazers'):
    """Return ``(repo_name, stars)`` for a stargazers anchor, else ``None``.

    ``elem`` is a parsed anchor tag. Its href is expected to look like
    ``/<owner>/<repo>/stargazers``; anchors whose last path segment is not
    ``mark``, or whose text is not an integer once whitespace and thousands
    separators are removed, are reported as ``None`` so the caller can skip
    them.
    """
    href = elem.get('href') or ''
    # Drop the leading "/" and the owner segment, then split off the final
    # segment so that what remains in the middle is the repository name.
    _owner, _, rest = href[1:].partition('/')
    name, _, tail = rest.rpartition('/')
    if tail != mark or not name:
        return None
    # The count is rendered with surrounding whitespace and "," separators.
    digits = ''.join(elem.get_text().split()).replace(',', '')
    try:
        return name, int(digits)
    except ValueError:
        return None


def start(page):
    """Collect star counts from a paginated repository listing.

    Pages are requested as ``page + str(n)`` for n = 1, 2, ... Every anchor
    matching ``a.no-wrap.muted-link.mr-3`` whose href ends in
    ``/stargazers`` contributes one entry to the result.

    Paging stops at the first request that fails (connection error, timeout
    or an error HTTP status) or at the first page containing no matching
    anchors. Anchors that cannot be parsed are skipped instead of aborting
    the crawl.

    Args:
        page: URL prefix to which the page number is appended.

    Returns:
        dict mapping repository name to its star count (int). Entries
        gathered before paging stopped are always returned.
    """
    cache = {}
    page_no = 1
    while True:
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            r = requests.get(page + str(page_no), timeout=30)
            r.raise_for_status()
        except requests.RequestException:
            # Only request-level failures end the crawl; anything else is a
            # programming error and is allowed to surface.
            break
        page_no += 1

        soup = BeautifulSoup(r.text, 'html5lib')
        target = soup.select("a.no-wrap.muted-link.mr-3")
        if not target:
            # A successful response without any matching anchor is treated
            # as the end of the listing; otherwise a server that answers
            # out-of-range pages with a success status would loop forever.
            break

        for elem in target:
            parsed = _parse_star_link(elem)
            if parsed is None:
                continue
            p_name, p_star = parsed
            cache[p_name] = p_star
    return cache

if __name__ == '__main__':
    # Listing URL prefix; start() appends the page number to it.
    base_url = r'https://github.com/facebook?page='
    stars_by_project = start(base_url)
    print('total project ', len(stars_by_project))

    # Rank by star count, breaking ties by project name, highest first.
    ranking = sorted(
        stars_by_project.items(),
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )
    for entry in ranking[:10]:
        print(entry)

结果

res

RocksDB 跌出前 10,前端开源项目占绝大多数。

只查后台开发语言

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def _parse_star_link(elem, mark='stargazers'):
    """Return ``(repo_name, stars)`` for a stargazers anchor, else ``None``.

    ``elem`` is a parsed anchor tag. Its href is expected to look like
    ``/<owner>/<repo>/stargazers``; anchors whose last path segment is not
    ``mark``, or whose text is not an integer once whitespace and thousands
    separators are removed, are reported as ``None`` so the caller can skip
    them.
    """
    href = elem.get('href') or ''
    # Drop the leading "/" and the owner segment, then split off the final
    # segment so that what remains in the middle is the repository name.
    _owner, _, rest = href[1:].partition('/')
    name, _, tail = rest.rpartition('/')
    if tail != mark or not name:
        return None
    # The count is rendered with surrounding whitespace and "," separators.
    digits = ''.join(elem.get_text().split()).replace(',', '')
    try:
        return name, int(digits)
    except ValueError:
        return None


def start(page, langs):
    """Collect star counts from a paginated listing, one pass per language.

    For each entry of ``langs`` the URL is built as
    ``(page + str(n)).format(<encoded language>)`` for n = 1, 2, ... Every
    anchor matching ``a.no-wrap.muted-link.mr-3`` whose href ends in
    ``/stargazers`` contributes one entry to the result.

    Paging for a language stops at the first request that fails (connection
    error, timeout or an error HTTP status) or at the first page containing
    no matching anchors; the next language is then processed. Anchors that
    cannot be parsed are skipped instead of aborting the crawl.

    Args:
        page: URL template with one ``{}`` placeholder for the language,
            to which the page number is appended before formatting.
        langs: iterable of raw (not yet percent-encoded) language names.

    Returns:
        dict mapping repository name to its star count (int), merged across
        all languages.
    """
    from urllib.parse import quote

    cache = {}
    for lang in langs:
        # Percent-encode the name: a literal "+" in a query string is read
        # as a space, so "c++" would otherwise not reach the server intact.
        encoded_lang = quote(lang, safe='')
        page_no = 1
        while True:
            url = (page + str(page_no)).format(encoded_lang)
            try:
                # A timeout keeps a stalled connection from hanging the crawl.
                r = requests.get(url, timeout=30)
                r.raise_for_status()
            except requests.RequestException:
                # Only request-level failures end this language's paging;
                # anything else is a programming error and should surface.
                break
            page_no += 1

            soup = BeautifulSoup(r.text, 'html5lib')
            target = soup.select("a.no-wrap.muted-link.mr-3")
            if not target:
                # A successful response without any matching anchor is
                # treated as the end of this language's listing; otherwise
                # a server that answers out-of-range pages with a success
                # status would loop forever.
                break

            for elem in target:
                parsed = _parse_star_link(elem)
                if parsed is None:
                    continue
                p_name, p_star = parsed
                cache[p_name] = p_star
    return cache


if __name__ == '__main__':
    # URL template: "{}" receives the language, the page number is appended.
    url_template = r'https://github.com/facebook?language={}&page='
    languages = ["java", "Python", "c++"]
    stars_by_project = start(url_template, languages)
    print('total project ', len(stars_by_project))

    # Rank by star count, breaking ties by project name, highest first.
    ranking = sorted(
        stars_by_project.items(),
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )
    for entry in ranking[:10]:
        print(entry)

结果

res2