11# -*- coding: utf-8 -*-
2- __author__ = 'Sean Lei'
2+ __author__ = 'Sean Lei&wuhan '
33
44from base_crawler import BaseCrawler
55from pyquery import PyQuery as Pq
6-
6+ from DAO import DAO
77
class VideoDetailCrawler(BaseCrawler):
    """Crawl one video detail page and store what it finds through ``DAO``.

    The page at ``seed_url`` is fetched and parsed for the video's link,
    cover image, title, identification code and maker, plus its genre tags
    and cast. The results are written to the ``av_info_main``, ``av_tag`` and
    ``video_cast`` tables unless the video is already recorded.
    """

    def __init__(self, seed_url):
        """Remember the detail-page URL and start with empty results.

        Args:
            seed_url: URL of the single detail page to crawl.

        NOTE(review): ``BaseCrawler.__init__`` is not invoked, as in the
        original code -- confirm the base class needs no initialisation.
        """
        self._seed_url = seed_url
        self.__video_detail = {
            'id': '',
            'name': '',
            'url': '',
            'img': '',
            'maker': '',
        }
        self.__tags = []
        self.__cast = []

    def _visit_pages(self):
        """Fetch the seed page and extract its data.

        Overrides the base-class hook; this crawler visits exactly one page.
        """
        html = self.get_page_content_str(self._seed_url)
        self._extract_data(html)

    @staticmethod
    def _link_texts(links):
        """Return the non-empty text of each element in *links*, in order.

        Reading the text element by element keeps multi-word names intact;
        joining all texts and splitting on spaces would break them apart and
        would yield one empty entry when nothing matched.
        """
        texts = (link.text() for link in links.items())
        return [text for text in texts if text]

    @staticmethod
    def _sql_quote(value):
        """Return *value* with single quotes doubled for use in a SQL literal.

        NOTE(review): this only neutralises ``'``. If ``DAO`` can accept bound
        parameters, switch to parameterised statements instead.
        """
        return value.replace("'", "''")

    def _extract_data(self, doc_str):
        """Parse the detail page markup, then persist the extracted record.

        Args:
            doc_str: markup of the detail page, as returned by
                ``get_page_content_str``.
        """
        doc = Pq(doc_str)
        detail = self.__video_detail
        title_link = doc('div>h3>a')

        # Missing attributes/text are stored as '' (the same placeholder
        # __init__ uses) rather than leaking the string "None" into the DB.
        # url
        detail["url"] = title_link.attr("href") or ''
        # cover image
        detail["img"] = doc('img#video_jacket_img').attr("src") or ''
        # title
        detail["name"] = title_link.text() or ''
        print(detail["name"])
        # identification code
        detail["id"] = doc('div#video_id')('td.text').text() or ''
        # maker
        detail["maker"] = doc('div#video_maker')('span.maker').text() or ''

        video_id = detail["id"]

        # genre tags: one row per link
        tag_links = doc('div#video_genres')("a[rel='category tag']")
        self.__tags = [
            {'video_id': video_id, 'tag': tag}
            for tag in self._link_texts(tag_links)
        ]

        # cast (actors): one row per link
        self.__cast = []
        cast_links = doc('div#video_cast')("a[rel='tag']")
        for actor in self._link_texts(cast_links):
            video_cast = {'video_id': video_id, 'actor': actor}
            print("video_cast is ", video_cast)
            self.__cast.append(video_cast)
            print(actor)

        self._video_dao()

    def _video_dao(self):
        """Insert the extracted video, its cast and its tags via ``DAO``.

        Nothing is written when a row with the same ``video_id`` and ``maker``
        is already present in ``av_info_main`` (a truthy result from
        ``execute_query`` is treated as "already recorded").
        """
        quote = self._sql_quote
        detail = self.__video_detail
        dao = DAO()

        # Is there already a record for this video in the table?
        query_sql = "select * from av_info_main where video_id='{}' and maker = '{}'".format(
            quote(detail["id"]), quote(detail["maker"]))
        if dao.execute_query(query_sql):
            # Report the id from the detail record: the cast list may be
            # empty, so indexing its first entry is not safe.
            print("video{} is already exists ,so next".format(detail["id"]))
            return

        # Insert the data.
        cast_sql = "INSERT INTO video_cast (video_id,actor) VALUES ('{}','{}' )"
        for cast_row in self.__cast:
            dao.execute_dml(cast_sql.format(
                quote(cast_row["video_id"]), quote(cast_row["actor"])))

        tag_sql = "INSERT INTO av_tag (video_id,video_tag ) VALUES ('{}','{}' )"
        for tag_row in self.__tags:
            dao.execute_dml(tag_sql.format(
                quote(tag_row["video_id"]), quote(tag_row["tag"])))

        main_sql = ("INSERT INTO av_info_main (video_id,video_name,video_src,img,maker )"
                    " VALUES ('{}','{}','{}','{}','{}' )")
        dao.execute_dml(main_sql.format(
            quote(detail["id"]), quote(detail["name"]), quote(detail["url"]),
            quote(detail["img"]), quote(detail["maker"])))
93+
if __name__ == '__main__':
    # Manual run against one known detail page.
    # `craw` is not defined in this file; presumably BaseCrawler provides it.
    crawler = VideoDetailCrawler("http://www.javlibrary.com/cn/?v=javlij3by4")
    crawler.craw()
0 commit comments