
Commit 02e477a

committed 2014-05-29 23:15:45

update video_detail_crawler.py, app.py, and video_list_crawler.py; add new file: DAO.py
1 parent 9d48d6d commit 02e477a

File tree

4 files changed: +163 −9 lines


DAO.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+__author__ = 'wuhan'
+import mysql.connector
+
+
+class DAO(object):
+    def __init__(self):
+        self.__user = 'avoper'
+        self.__pwd = '******'
+        self.__db_host = '127.0.0.1'
+        self.__db = 'avdb'
+
+    def execute_dml(self, sql):
+        """
+        execute a sql insert or update
+        :param sql: the DML statement to run
+        :return:
+        """
+        cnx = mysql.connector.connect(user=self.__user, password=self.__pwd,
+                                      host=self.__db_host, database=self.__db)
+        print(sql)
+        cursor = cnx.cursor()
+        try:
+            cursor.execute(sql)
+        except mysql.connector.Error as sql_err:
+            print("Error: {}".format(sql_err.msg))
+            with open('test.log', 'a') as log_sql:
+                log_sql.write("Error: {}\n in the insert/update sql: {}".format(sql_err.msg, sql))
+        cnx.commit()
+        cursor.close()
+        cnx.close()
+
+    def execute_query(self, sql):
+        """
+        execute a sql query
+        :param sql: the query to run
+        :return: the first result row, or None if the result set is empty
+        """
+        cnx = mysql.connector.connect(user=self.__user, password=self.__pwd,
+                                      host=self.__db_host, database=self.__db)
+        print(sql)
+        cursor = cnx.cursor()
+        try:
+            cursor.execute(sql)
+            # fetchone() returns a plain tuple, so no deepcopy is needed
+            result_row = cursor.fetchone()
+            if result_row:
+                print(result_row)
+            else:
+                print("result is null")
+            cursor.close()
+            cnx.close()
+            return result_row
+        except mysql.connector.Error as sql_err:
+            print("Error: {}".format(sql_err.msg))
+            with open('test.log', 'a') as log_sql:
+                log_sql.write("Error: {}\n in the query sql: {}".format(sql_err.msg, sql))
+
+
+if __name__ == '__main__':
+    dao = DAO()
+    dao.execute_query("select * from av_tag")
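Worth noting: both execute_dml and execute_query interpolate values straight into the SQL string, which breaks as soon as a title or actor name contains a quote. A minimal sketch of the safer parameterized form, assuming the same mysql.connector driver (the helper name run_dml_safely is hypothetical, not part of this commit):

import mysql.connector

def run_dml_safely(cnx, sql, params):
    # hypothetical helper, not part of this commit: the driver fills the
    # %s placeholders itself and escapes any quotes inside the values
    cursor = cnx.cursor()
    try:
        cursor.execute(sql, params)
        cnx.commit()
    finally:
        cursor.close()

# usage sketch: the values travel separately from the statement
# run_dml_safely(cnx,
#                "INSERT INTO av_tag (video_id, video_tag) VALUES (%s, %s)",
#                ("ABC-123", "drama"))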

app.py

Lines changed: 2 additions & 2 deletions
@@ -2,9 +2,9 @@
 __author__ = 'Sean Lei'

 from video_list_crawler import VideoListCrawler
-
+from video_detail_crawler import *

 if __name__ == '__main__':
     crawler = VideoListCrawler()
     crawler.craw()
-    print(crawler.detail_info_urls)
+    # crawler_detail = VideoDetailCrawler()  # detail crawling now runs inside VideoListCrawler
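A side note on the new import: from video_detail_crawler import * pulls every public name into app.py, while only VideoDetailCrawler is referenced, and currently only in the commented-out line. An explicit import would keep the namespace clean once that line is revived:

from video_detail_crawler import VideoDetailCrawler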

video_detail_crawler.py

Lines changed: 88 additions & 4 deletions
@@ -1,12 +1,96 @@
 # -*- coding: utf-8 -*-
-__author__ = 'Sean Lei'
+__author__ = 'Sean Lei & wuhan'

 from base_crawler import BaseCrawler
 from pyquery import PyQuery as Pq
-
+from DAO import DAO

 class VideoDetailCrawler(BaseCrawler):
+    def __init__(self, seed_url):
+        self._seed_url = seed_url
+        self.__video_detail = {
+            'id': '',
+            'name': '',
+            'url': '',
+            'img': '',
+            'maker': ''
+        }
+        self.__tags = []
+        self.__cast = []
+
+    def _visit_pages(self):
+        """
+        @override
+        in this class only one page is visited
+        """
+        html = self.get_page_content_str(self._seed_url)
+        self._extract_data(html)
+
     def _extract_data(self, doc_str):
         doc = Pq(doc_str)
-        print(doc)
-        pass
+        # url
+        self.__video_detail["url"] = doc('div>h3>a').attr("href")
+        # image
+        self.__video_detail["img"] = doc('img').filter('#video_jacket_img').attr("src")
+        # name (title)
+        print(doc('div>h3>a').text())
+        self.__video_detail["name"] = doc('div>h3>a').text()
+        # ID (identification code)
+        doc2 = Pq(doc('div').filter("#video_id"))
+        self.__video_detail["id"] = doc2("td").filter(".text").text()
+        # maker (studio)
+        doc2 = Pq(doc('div').filter("#video_maker"))
+        self.__video_detail["maker"] = doc2("span").filter(".maker").text()
+        # tags
+        doc2 = Pq(doc('div').filter("#video_genres"))
+        for tag in doc2("a[rel='category tag']").text().split(" "):
+            if tag:  # split() yields empty strings, never None, so test truthiness
+                self.__tags.append({
+                    'video_id': self.__video_detail["id"],
+                    'tag': tag
+                })
+        # cast (actors)
+        doc2 = Pq(doc('div').filter("#video_cast"))
+        for cast in doc2("a[rel='tag']").text().split(" "):
+            if cast:
+                video_cast = {'video_id': self.__video_detail["id"], 'actor': cast}
+                print("video_cast is ", video_cast)
+                self.__cast.append(video_cast)
+        self._video_dao()
+
+    def _video_dao(self):
+        dao = DAO()
+        # check whether the record already exists in the table
+        query_sql = "select * from av_info_main where video_id='{}' and maker='{}'".format(
+            self.__video_detail["id"], self.__video_detail["maker"])
+        if dao.execute_query(query_sql):
+            print("video {} already exists, skipping".format(self.__video_detail["id"]))
+            return
+        # insert the new records
+        for video_cast1 in self.__cast:
+            insert_sql = "INSERT INTO video_cast (video_id, actor)" \
+                         " VALUES ('{}', '{}')".format(video_cast1["video_id"], video_cast1["actor"])
+            dao.execute_dml(insert_sql)
+        for tag1 in self.__tags:
+            insert_sql = "INSERT INTO av_tag (video_id, video_tag)" \
+                         " VALUES ('{}', '{}')".format(tag1["video_id"], tag1["tag"])
+            dao.execute_dml(insert_sql)
+        insert_sql = "INSERT INTO av_info_main (video_id, video_name, video_src, img, maker)" \
+                     " VALUES ('{}', '{}', '{}', '{}', '{}')".format(
+                         self.__video_detail["id"], self.__video_detail["name"],
+                         self.__video_detail["url"], self.__video_detail["img"],
+                         self.__video_detail["maker"])
+        dao.execute_dml(insert_sql)
+
+
+if __name__ == '__main__':
+    v1 = VideoDetailCrawler("http://www.javlibrary.com/cn/?v=javlij3by4")
+    v1.craw()
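base_crawler.py is not part of this commit, so craw() and get_page_content_str() are known here only from their call sites. A minimal sketch of the assumed interface, just so the example above can run (the bodies are assumptions; only the method names come from the code in this commit):

# hypothetical base_crawler.py -- method names taken from the call sites
# above, bodies assumed
import urllib.request


class BaseCrawler(object):
    def craw(self):
        # template method: subclasses decide which pages to visit
        self._visit_pages()

    def get_page_content_str(self, url):
        # fetch one page and decode it to text
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8', errors='ignore')

    def _visit_pages(self):
        raise NotImplementedError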

video_list_crawler.py

Lines changed: 4 additions & 3 deletions
@@ -4,7 +4,7 @@
 from pyquery import PyQuery as Pq

 from base_crawler import BaseCrawler
-
+from video_detail_crawler import VideoDetailCrawler

 class VideoListCrawler(BaseCrawler):
     def __init__(self):
@@ -18,7 +18,7 @@ def _generate_seed_url(self):
         """
         generate all urls to visit
         """
-
+        # from page 1 to any page below 200
         for page_no in range(1, 2):
             self._seed_url.append(self._domain + self._info_uri + page_no.__str__())

@@ -30,4 +30,5 @@ def _extract_data(self, doc_str):
             video_id = video_id[4:]
             detail_url = self._domain + self._detail_uri + video_id
             self.detail_info_urls.append(detail_url)
-
+            crawler = VideoDetailCrawler(detail_url)
+            crawler.craw()
