Skip to content

Commit 0b68c56

Browse files
authored
Create crawler.py
1 parent 00450aa commit 0b68c56

File tree

1 file changed

+66
-0
lines changed

1 file changed

+66
-0
lines changed

crawler.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import os
2+
import requests
3+
from ruamel.yaml import YAML
4+
from bs4 import BeautifulSoup
5+
6+
# Pieces of the Stack Overflow tag-listing URL; crawl_pages() assembles the
# final page URL as BASE_URL + SORT + PAGE + <n> + PAGE_SIZE_URL + <size>.
BASE_URL = 'https://stackoverflow.com/questions/tagged/c'
SORT = '?sort=votes'
PAGE = '&page='
PAGE_SIZE_URL = '&pageSize='

# Questions requested (and parsed) per listing page.
PAGE_SIZE = 15
# Maximum number of answers printed per question.
NUM_ANSWERS = 3

# Desktop-browser User-Agent so Stack Overflow serves the regular HTML page
# instead of a bot/blocked response.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
17+
18+
def crawl_pages(num_pages):
    """Crawl `num_pages` tag-listing pages, printing each question found.

    For every listing page, each question link (up to PAGE_SIZE per page)
    is followed via parse_question(), which prints the question body and
    its top answers.

    Parameters:
        num_pages: number of consecutive listing pages to crawl, starting
            from page 1.

    Stops early on Ctrl-C / EOF / SystemExit.  A page whose HTTP request
    fails is reported and skipped instead of aborting the whole crawl
    (the original version let requests exceptions propagate and crash).
    """
    start = 1
    current_page = start
    end = start + num_pages
    while current_page != end:
        page_url = (
            BASE_URL + SORT + PAGE + str(current_page)
            + PAGE_SIZE_URL + str(PAGE_SIZE)
        )
        try:
            response = requests.get(page_url, headers=headers, timeout=10)
            response.raise_for_status()
        except (KeyboardInterrupt, EOFError, SystemExit):
            print("\nStopped by user!")
            break
        except requests.RequestException as exc:
            # Network/HTTP failure on this page only: report and move on.
            print('failed to fetch page ' + str(current_page) + ': ' + str(exc))
            current_page += 1
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        print('crawling page ' + str(current_page) + ': ' + page_url)
        try:
            for q_no, link in enumerate(
                    soup.find_all('a', {'class': 'question-hyperlink'})):
                if q_no == PAGE_SIZE:
                    break
                # hrefs are site-relative ('/questions/...'); the original
                # naive concatenation produced 'http://...//questions/...'.
                # urljoin handles both relative and absolute hrefs.
                url = urljoin(BASE_URL, link.get('href'))
                title = link.get_text()
                print("------------------------------")
                print(title)
                parse_question(url, title)
            current_page += 1
        except (KeyboardInterrupt, EOFError, SystemExit):
            print("\nStopped by user!")
            break
42+
43+
def parse_question(url, title):
    """Fetch one question page and print its body plus up to NUM_ANSWERS answers.

    Parameters:
        url: absolute URL of the Stack Overflow question page.
        title: question title, reprinted above each answer.

    Returns None; all output goes to stdout.  If the expected 'postcell'
    div is absent (markup change, error page) the function returns without
    printing anything instead of printing 'None' as the original did.
    """
    page = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(page.content, 'html.parser')
    question = soup.find('div', class_='postcell')
    if question is None:
        # Page layout not as expected -- nothing meaningful to print.
        return
    # The second child of the postcell div holds the question body
    # (presumably the post body container -- TODO confirm against live markup).
    question_text = list(question)[1].get_text()
    print(question_text)
    print("List of answers:\n\n")
    # Slice caps the output at NUM_ANSWERS (replaces the manual end/range loop).
    for cell in soup.find_all('div', class_='answercell')[:NUM_ANSWERS]:
        body = cell.find('div', class_='post-text')
        if body is None:
            # Defensive: an answer cell without the expected body div would
            # otherwise raise AttributeError on .get_text().
            continue
        print("===>")
        print(title)
        print(body.get_text())
61+
62+
def main():
    """Entry point: crawl the first two listing pages, then report completion."""
    crawl_pages(2)
    print('\nDone!')


# Guard the entry point so importing this module does not kick off a
# network crawl (the original called main() unconditionally at import time).
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)