|
| 1 | +import os |
| 2 | +import requests |
| 3 | +from ruamel.yaml import YAML |
| 4 | +from bs4 import BeautifulSoup |
| 5 | + |
# Listing URL for questions tagged "c", followed by the query-string
# fragments that are concatenated onto it to build one listing-page URL.
BASE_URL = 'https://stackoverflow.com/questions/tagged/c'
SORT = '?sort=votes'
PAGE = '&page='
PAGE_SIZE_URL = '&pageSize='

# PAGE_SIZE is sent as the pageSize query value and also caps how many
# question links are followed per page; NUM_ANSWERS caps how many answers
# are printed per question.
PAGE_SIZE = 15
NUM_ANSWERS = 3

# Headers sent with every HTTP request. The user-agent value is split across
# adjacent literals purely for line length; the resulting string is unchanged.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/53.0.2785.143 Safari/537.36'
    ),
}
| 17 | + |
def crawl_pages(num_pages):
    """Crawl vote-sorted listing pages and parse every question found.

    Fetches ``num_pages`` consecutive listing pages, starting at page 1, and
    calls ``parse_question`` for at most ``PAGE_SIZE`` question links on each
    page. Progress is printed to stdout. A ``KeyboardInterrupt``,
    ``EOFError`` or ``SystemExit`` raised while crawling stops the crawl with
    a "Stopped by user!" message instead of propagating.

    Args:
        num_pages: Number of listing pages to crawl. Zero or a negative
            value crawls nothing.

    Returns:
        None.
    """
    # Imported locally so this function does not depend on edits to the
    # file-level import block.
    from urllib.parse import urljoin

    first_page = 1
    try:
        # range() is empty for num_pages <= 0; the previous
        # `while current_page != end` loop never terminated for a negative
        # count because the counter only moved away from `end`.
        for current_page in range(first_page, first_page + num_pages):
            page_url = (BASE_URL + SORT + PAGE + str(current_page)
                        + PAGE_SIZE_URL + str(PAGE_SIZE))
            source_code = requests.get(page_url, headers=headers, timeout=10).text
            soup = BeautifulSoup(source_code, 'html.parser')
            print('crawling page ' + str(current_page) + ': ' + page_url)

            links = soup.find_all('a', {'class': 'question-hyperlink'})
            for link in links[:PAGE_SIZE]:
                href = link.get('href')
                if not href:
                    # An anchor without an href cannot be followed; the old
                    # string concatenation raised TypeError here.
                    continue
                # Resolve against BASE_URL so the scheme and host match the
                # listing page and a leading "/" in href does not produce
                # "stackoverflow.com//questions/...".
                url = urljoin(BASE_URL, href)
                title = link.get_text()
                print("------------------------------")
                print(title)
                parse_question(url, title)
    except (KeyboardInterrupt, EOFError, SystemExit):
        print("\nStopped by user!")
| 42 | + |
def parse_question(url, title):
    """Fetch one question page and print its question text and top answers.

    Prints the question text (or ``None`` when the page has no ``postcell``
    div), then a "List of answers:" header, then up to ``NUM_ANSWERS``
    answers, each preceded by a ``===>`` marker and the title.

    Args:
        url: URL of the question page to download.
        title: Question title, printed before each answer.

    Returns:
        None.
    """
    page = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(page.content, 'html.parser')

    question = soup.find('div', class_='postcell')
    if question is not None:
        children = list(question)
        if len(children) > 1:
            # Same node the original read: the second child of the post cell.
            question = children[1].get_text()
        else:
            # Fewer children than expected: fall back to the whole cell's
            # text rather than raising IndexError.
            question = question.get_text()

    answers = soup.find_all('div', class_='answercell')

    print(question)
    print("List of answers:\n\n")
    for answer_cell in answers[:NUM_ANSWERS]:
        body = answer_cell.find('div', class_='post-text')
        if body is None:
            # No 'post-text' div in this answer cell; skip it instead of
            # failing with AttributeError on None.
            continue
        print("===>")
        print(title)
        print(body.get_text())
| 61 | + |
def main():
    """Crawl two listing pages, then print a completion message."""
    pages_to_crawl = 2
    crawl_pages(pages_to_crawl)
    print('\nDone!')
| 65 | + |
# Start the crawl only when this file is executed as a script, so importing
# the module (e.g. to reuse parse_question) does not trigger network requests.
if __name__ == '__main__':
    main()
0 commit comments