11" Enter Url here"
22
3-
43url = ['http://www.tripadvisor.com/Restaurant_Review-g42251-d4164754-Reviews-Taboon-Grand_Blanc_Michigan.html' ,'http://www.tripadvisor.com/Restaurant_Review-g28970-d2137408-Reviews-Lincoln_DC-Washington_DC_District_of_Columbia.html' ,'http://www.tripadvisor.com/Restaurant_Review-g29556-d416774-Reviews-Zingerman_s_Delicatessen-Ann_Arbor_Michigan.html' ,'http://www.tripadvisor.com/Restaurant_Review-g58277-d3529482-Reviews-FireBrew-Virginia_Beach_Virginia.html' ]
54
5+
66from scrapy .spider import BaseSpider
77from scrapy .http import Request
88from urlparse import urljoin
@@ -22,7 +22,7 @@
 class YelpSpider(BaseSpider):
 
 
-    name = "trip "
+    name = "tripss "
     start_urls = [url[i] for i in range(len(url))]
 
     def parse(self, response):
@@ -39,6 +39,10 @@ def parse(self, response):
         cur.execute(qu)
         rows = cur.fetchall()
         li = [r[0] for r in rows]
+
+
+
+
 # print li
         for i in range(len(li)):
 # print li[i], item['url']
@@ -54,10 +58,18 @@ def parse(self, response):
                 # print li1
                 if len(li1) > 0:
                     item['bid'] = li1[0]
+                    qu2 = ("select max(d) from social_data.tripadvisor1 where id=%d") % item['bid']
+                    cur.execute(qu2)
+                    rows2 = cur.fetchall()
+                    max_date = [i[0] for i in rows2][0]
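+                    # max_date: timestamp of the most recent review already stored for this business id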
+                    print qu2, rows2, max_date
+
         if item['bid'] > 0:
             print " ID assigned"
         else:
             item['bid'] = input("Enter the Business Id here for URL: %s : " % item['url'])
+            max_date = datetime.strptime('', '')
+            print max_date
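+            # strptime('', '') parses to datetime(1900, 1, 1), used below as a "no reviews stored yet" sentinel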
 
         print item['bid']
 
@@ -69,28 +81,25 @@ def parse(self, response):
 
         if item['rv_count'] > 0:
 
-            no = len(hxs.select('//div[@class="reviewSelector"]').extract())
+            no = hxs.select('//div[@class="review basic_review inlineReviewUpdate provider0"]').extract()
+
+            for i in range(len(no)):
+
 
-            for i in range(no):
-                # cur.execute("select max(d) from social_data.tripadvisor1 where id='%s'" % item['bid'])
-                # r = cur.fetchall()
-                # m_dt = r[0][0]
-
                 try:
                     xd = hxs.select('//span[@class="ratingDate"]').extract()[i].encode('utf-8').split('Reviewed')[1].split('\n')[0].replace(',', '').replace('\n', '').strip()
-                    date = datetime.strptime(xd, '%B %d %Y')
-                    item['rv_date'] = str(date)
+
+                    item['rv_date'] = str(datetime.strptime(xd, '%B %d %Y'))
 
 
                 except:
-                    date = datetime.strptime('9999-01-01', '%Y-%d-%m')
                     item['rv_date'] = '0001-01-01 00:00:00'
+                current_date = datetime.strptime(item['rv_date'], '%Y-%m-%d %X')
 
                 try:
                     item['rv_profile'] = hxs.select('//div[@class="username mo"]/span/text()').extract()[i].encode('ascii', 'ignore').replace('.', '').replace("'", " ").strip()
                 except:
-                    item['rv_profile'] = 'NULL'
-
+                    item['rv_profile'] = 'A TripAdvisor reviewer on Facebook'
                 try:
                     item['rv_heading'] = hxs.select('//span[@class="noQuotes"]/text()').extract()[i].encode('ascii', 'ignore').replace('.', '').replace("'", " ").strip()
                 except:
@@ -101,18 +110,25 @@ def parse(self, response):
                 except:
                     item['rv_rating'] = '0.0'
                 try:
-                    item['rv_dc'] = hxs.select('//p[@class="partial_entry"]').extract()[i].encode('ascii', 'ignore').split('\n')[1].replace("'", " ").replace('.', '').strip()
+                    item['rv_dc'] = hxs.select('//div[@class="entry"]/p[@class="partial_entry"]').extract()[i].encode('ascii', 'ignore').split('\n')[1].replace("'", " ").replace('.', '').strip()
                 except:
                     item['rv_dc'] = 'NULL'
-                # if date >= m_dt:
-                sql = ("insert into social_data.tripadvisor1 select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.tripadvisor1 where dc='%s' and p='%s')" % (item['bid'], item['url'], item['rating'], item['rv_count'], item['rv_date'], item['rv_heading'], item['rv_rating'], item['rv_dc'], item['rv_profile'], item['rv_dc'], item['rv_profile']))
-                # else:
-                # break
-
-                cur.execute(sql)
-                con.commit()
+                last_date = current_date
+
 
-                print item['url'], item['rv_profile']
+                """ Only insert the new feeds """
+                print current_date, max_date
+                if current_date >= max_date:
+
+                    sql = ("insert into social_data.tripadvisor1 select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.tripadvisor1 where dc='%s' and p='%s')" % (item['bid'], item['url'], item['rating'], item['rv_count'], item['rv_date'], item['rv_heading'], item['rv_rating'], item['rv_dc'], item['rv_profile'], item['rv_dc'], item['rv_profile']))
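+                    # the WHERE NOT EXISTS guard skips reviews already stored with the same text (dc) and reviewer (p)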
+
+                    cur.execute(sql)
+                    con.commit()
+                else:
+                    print "No updated reviews here"
+                    break
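+                    # assumes TripAdvisor lists reviews newest-first, so anything older than max_date is already stored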
+
+                # print item['url'], item['rv_profile']
 
 
             """ Parse Next link"""
@@ -129,9 +145,18 @@ def parse(self, response):
 
             print nxt_link
 
+
             if nxt_link:
                 """ Next link Processed """
-                yield Request(nxt_link, callback=self.parse)
+
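+                # three cases: newer reviews on a known business -> keep paging with parse();
+                # max_date still at the 1900 sentinel (business not yet in the DB) -> full crawl via parse_sub();
+                # otherwise nothing newer is left on deeper pages, so stop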
+                if last_date > max_date and '1900-01-01 00:00:00' not in str(max_date):
+                    print "enter 1st"
+                    yield Request(nxt_link, callback=self.parse)
+                elif '1900-01-01 00:00:00' in str(max_date):
+                    print "enter 2nd"
+                    yield Request(nxt_link, callback=self.parse_sub)
+                else:
+                    print " Do nothing, no other pages available"
             else:
                 print " Progress Completed "
 
@@ -144,6 +169,147 @@ def parse(self, response):
             item['rv_dc'] = 'NULL'
             item['rv_heading'] = 'NULL'
 
+    def parse_sub(self, response):
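+        # full-crawl twin of parse(): same extraction, but every review on every page is inserted (no max_date cut-off)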
+
+        print "Sub parse Called"
+
+        itm = []
+        hxs1 = HtmlXPathSelector(response)
+        item = TripItem()
+        item['bid'] = 0
+        item['url'] = response.url
+
+        """ Get the Business Id from the DB if it exists """
+
+        qu = ("select distinct u from social_data.tripadvisor1")
+        cur.execute(qu)
+        rows = cur.fetchall()
+        li = [r[0] for r in rows]
+
+
+
+        # print li
+        for i in range(len(li)):
+            # print li[i], item['url']
+            try:
+                lis = li[i].split('-or')[1].split('-')[1]
+            except:
+                lis = li[i].split('Reviews-')[1].split('-')[0]
+            if item['url'].find(lis) > 0:
+                qu1 = ("select distinct id from social_data.tripadvisor1 where u='%s'") % li[i]
+                cur.execute(qu1)
+                rows1 = cur.fetchall()
+                li1 = [i[0] for i in rows1]
+                # print li1
+                if len(li1) > 0:
+                    item['bid'] = li1[0]
+        if item['bid'] > 0:
+            print " ID assigned"
+        else:
+            item['bid'] = input("Enter the Business Id here for URL: %s : " % item['url'])
+
+
+        print item['bid']
+
+
+
+
+        item['rating'] = float(hxs1.select('//div[@class="rs rating"]/span/img/@content').extract()[0].encode('ascii', 'ignore').strip())
+        item['rv_count'] = int(hxs1.select('//div[@class="rs rating"]/a/span/text()').extract()[0].encode('ascii', 'ignore').strip())
+
+        if item['rv_count'] > 0:
+
+            no = hxs1.select('//div[@class="review basic_review inlineReviewUpdate provider0"]').extract()
+
+            for i in range(len(no)):
+
+
+                try:
+                    xd = hxs1.select('//span[@class="ratingDate"]').extract()[i].encode('utf-8').split('Reviewed')[1].split('\n')[0].replace(',', '').replace('\n', '').strip()
+
+                    item['rv_date'] = str(datetime.strptime(xd, '%B %d %Y'))
+
+
+                except:
+                    item['rv_date'] = '0001-01-01 00:00:00'
+                current_date = datetime.strptime(item['rv_date'], '%Y-%m-%d %X')
+
+                try:
+                    item['rv_profile'] = hxs1.select('//div[@class="username mo"]/span/text()').extract()[i].encode('ascii', 'ignore').replace('.', '').replace("'", " ").strip()
+                except:
+                    item['rv_profile'] = 'A TripAdvisor reviewer on Facebook'
+                try:
+                    item['rv_heading'] = hxs1.select('//span[@class="noQuotes"]/text()').extract()[i].encode('ascii', 'ignore').replace('.', '').replace("'", " ").strip()
+                except:
+                    item['rv_heading'] = 'NULL'
+
+                try:
+                    item['rv_rating'] = float(hxs1.select('//div[@class="rating reviewItemInline"]/span/img/@alt').extract()[i].encode('ascii', 'ignore').split(' ')[0].strip())
+                except:
+                    item['rv_rating'] = '0.0'
+                try:
+                    item['rv_dc'] = hxs1.select('//div[@class="entry"]/p[@class="partial_entry"]').extract()[i].encode('ascii', 'ignore').split('\n')[1].replace("'", " ").replace('.', '').strip()
+                except:
+                    item['rv_dc'] = 'NULL'
+                last_date = current_date
+
+
+                """ Only insert the new feeds """
+
+
+
+                sql = ("insert into social_data.tripadvisor1 select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.tripadvisor1 where dc='%s' and p='%s')" % (item['bid'], item['url'], item['rating'], item['rv_count'], item['rv_date'], item['rv_heading'], item['rv_rating'], item['rv_dc'], item['rv_profile'], item['rv_dc'], item['rv_profile']))
+
+                cur.execute(sql)
+                con.commit()
+
+                # print item['url'], item['rv_profile']
+
+
+            """ Parse Next link"""
+
+
+
+            try:
+                link = hxs1.select('//div[@class="pgLinks"]/a[@class="guiArw sprite-pageNext"]/@href').extract()[0].encode('ascii', 'ignore').strip()
+                nxt_link = urljoin(response.url, link)
+            except:
+                nxt_link = []
+
+
+
+            print nxt_link
+
+
+            if nxt_link:
+                """ Next link Processed """
+                yield Request(nxt_link, callback=self.parse_sub)
+
+            else:
+                print " Progress Completed "
+
+
+
+        else:
+            item['rv_date'] = 'NULL'
+            item['rv_profile'] = 'NULL'
+            item['rv_rating'] = 'NULL'
+            item['rv_dc'] = 'NULL'
+            item['rv_heading'] = 'NULL'
+
+
+CREATE TABLE social_data.tripadvisor
+(
+    bid integer,
+    url text,
+    rating double precision,
+    rv_count integer,
+    rv_date timestamp without time zone,
+    rv_heading text,
+    rv_rating double precision,
+    rv_desc text,
+    rv_user text
+)
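
The diff never shows where `con`, `cur`, `HtmlXPathSelector`, `TripItem`, or `datetime` come from (presumably the collapsed lines 9-21 of the file). Below is a minimal sketch of what that module-level setup likely looks like, plus a parameterized version of the hand-built INSERT, assuming psycopg2 as the driver and an `items.py` module for `TripItem` (both assumptions, not shown in the commit):

# Assumed setup, not part of the commit: shared psycopg2 connection and the imports parse()/parse_sub() rely on.
import psycopg2
from datetime import datetime
from scrapy.selector import HtmlXPathSelector
from items import TripItem   # hypothetical module path

con = psycopg2.connect(host='localhost', dbname='social', user='scraper', password='secret')  # placeholder credentials
cur = con.cursor()

def insert_review(item):
    # Parameterized equivalent of the string-built INSERT above; the driver handles quoting,
    # so stripping single quotes out of the scraped text is no longer needed for the SQL to parse.
    sql = ("insert into social_data.tripadvisor1 "
           "select %s, %s, %s, %s, %s, %s, %s, %s, %s "
           "where not exists (select * from social_data.tripadvisor1 where dc = %s and p = %s)")
    cur.execute(sql, (item['bid'], item['url'], item['rating'], item['rv_count'],
                      item['rv_date'], item['rv_heading'], item['rv_rating'],
                      item['rv_dc'], item['rv_profile'],
                      item['rv_dc'], item['rv_profile']))
    con.commit()

The NOT EXISTS guard keyed on (dc, p) is kept as in the spider, so repeated crawls of the same pages stay idempotent.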