1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
def _parse_star_anchor(anchor_html, mark='stargazers'):
    """Extract ``(project_name, star_count)`` from one anchor's HTML.

    Args:
        anchor_html: The serialized ``<a ...>...</a>`` element.
        mark: Last path segment the anchor's ``href`` must end with.

    Returns:
        A ``(name, stars)`` tuple, or ``None`` when the anchor does not end
        in ``mark`` or does not have the expected shape.
    """
    match = re.search(r'href="(.*?)"', anchor_html)
    if match is None:
        return None

    # Drop the first character of the href (the leading "/" of a path such
    # as "/owner/project/stargazers").
    path = match.group(1)[1:]
    head, sep, tail = path.rpartition("/")
    if not sep or tail != mark:
        return None
    # Everything between the first and the last "/" is the project name.
    name = head.partition("/")[2]

    svg_end = anchor_html.find('</svg>')
    anchor_end = anchor_html.rfind('</a>')
    if svg_end == -1 or anchor_end == -1:
        return None
    # The count is the text between the icon and the closing tag; whitespace
    # and thousands separators are removed before conversion.
    raw = anchor_html[svg_end + len('</svg>'):anchor_end]
    digits = re.sub(r'\s+', '', raw).replace(',', '')
    try:
        return name, int(digits)
    except ValueError:
        return None


def start(page, langs, *, timeout=10):
    """Collect star counts for the projects listed on paginated pages.

    For every language the pages ``1, 2, 3, ...`` are fetched until a request
    fails or a page contains no matching anchors.

    Args:
        page: URL template containing one ``{}`` placeholder for the
            language; the page number is appended to its end.
        langs: Iterable of raw (not percent-encoded) language names.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        A dict mapping project name to its star count as an ``int``. A
        project seen more than once keeps the last value read.

    Raises:
        IndexError, KeyError, ValueError: If ``page`` is not a valid
            ``str.format`` template taking one positional argument.
    """
    from urllib.parse import quote

    cache = {}
    mark = 'stargazers'
    for lang in langs:
        # Percent-encode the language: a literal "+" in a query string is
        # conventionally decoded as a space, which would turn "c++" into "c".
        encoded_lang = quote(str(lang), safe='')
        i = 1
        while True:
            url = (page + str(i)).format(encoded_lang)
            try:
                r = requests.get(url, timeout=timeout)
                r.raise_for_status()
            except requests.RequestException:
                # Best effort: a failed request ends paging for this language.
                break
            i += 1

            soup = BeautifulSoup(r.text, 'html5lib')
            target = soup.select("a.no-wrap.muted-link.mr-3")
            if not target:
                # A page without matching anchors is treated as the end of
                # the listing; otherwise a server answering HTTP 200 for
                # out-of-range pages would keep this loop running forever.
                break

            for elem in target:
                parsed = _parse_star_anchor(str(elem), mark)
                if parsed is None:
                    continue
                p_name, p_star = parsed
                cache[p_name] = p_star
    return cache
if __name__ == '__main__':
    # "{}" receives the language name; start() appends the page number.
    url_template = r'https://github.com/facebook?language={}&page='
    languages = ["java", "Python", "c++"]

    stars_by_project = start(url_template, languages)
    print('total project ', len(stars_by_project))

    def _rank(entry):
        """Order entries by star count, then by project name."""
        name, stars = entry
        return stars, name

    # Highest star count first; report only the ten best entries.
    ranking = list(stars_by_project.items())
    ranking.sort(key=_rank, reverse=True)
    for entry in ranking[:10]:
        print(entry)
|
能否参与评论，且看个人手段。