@@ -59,6 +59,7 @@ def parse_sub(self, response):
5959
6060
6161
62+
6263< html >
6364< head >
6465< title > Sprites < / title >
@@ -96,4 +97,91 @@ def parse_sub(self, response):
9697
9798 < / div >
9899< / body >
99- < / html >
100+ < / html >
101+
102+
# Enter the URLs to crawl here.
url = [
    'http://www.yelp.com/biz/firebrew-virginia-beach',
    'http://www.yelp.com/biz/sabra-design-washington-3?osq=web+design+companies',
]
106+
107+ from scrapy .spider import BaseSpider
108+ from scrapy .http import Request
109+ from urlparse import urljoin
110+ from scrapy .selector import HtmlXPathSelector
111+ from urbans .items import UrbansItem
112+ import psycopg2
113+ import time
114+
115+
# Database creation: open one module-level PostgreSQL connection and a
# cursor on it.  Both are created as soon as this module is imported.
con = psycopg2.connect(
    database="mm",
    user="postgres",
    password="mercuryminds",
    host="localhost",
)
cur = con.cursor()
119+
class UrbansSpider(BaseSpider):
    """Spider that stores the reviews found on business pages in ``public.y``.

    Every entry of the module-level ``url`` list is a start page.  ``parse``
    writes one row per review through the module-level ``cur``/``con``
    database handles and follows the pagination link while the page reports
    more reviews than have been processed so far.
    """

    name = "ylp"
    start_urls = list(url)
    # NOTE(review): this handle is opened (truncating opt.csv) when the class
    # is defined, but nothing visible here writes to it or closes it.  It is
    # kept so that any code relying on ``UrbansSpider.f`` keeps working.
    f = open('opt.csv', 'w')
    # Number of reviews processed so far.
    # NOTE(review): the counter lives on the class, so it is shared by every
    # start URL -- confirm the businesses are never crawled concurrently.
    count = 0

    # Placeholder stored in the item when a review field cannot be scraped.
    _MISSING = 'NULL'

    # Placeholders are filled in by the database driver, never by string
    # formatting, so scraped text cannot alter the statement.  The
    # ``where not exists`` clause skips a review whose text and profile are
    # already stored.
    _INSERT_SQL = (
        "insert into public.y select %s,%s,%s,%s,%s,%s,%s,%s "
        "where not exists ( select * from public.y where dc=%s and p=%s)"
    )

    @staticmethod
    def _text(value):
        """Return *value* encoded as UTF-8 with surrounding whitespace removed."""
        return value.encode('utf-8').strip()

    @staticmethod
    def _clean_body(markup):
        """Return the review text contained in the extracted ``<p>`` markup.

        Raises IndexError when the markup has no ``lang="en">`` marker.
        """
        text = markup.encode('utf-8').split('lang="en">')[1]
        for junk in ("<br>", '</p>', '\xc2 ', '\xa0 '):
            text = text.replace(junk, '')
        # Apostrophes are still blanked out so that the duplicate check keeps
        # matching rows that were stored before the query was parameterised.
        return text.replace("'", " ").strip()

    @staticmethod
    def _field(values, index, convert, default):
        """Return ``convert(values[index])``, or *default* if missing or malformed."""
        try:
            return convert(values[index])
        except (IndexError, ValueError):
            return default

    def parse(self, response):
        """Store every review on *response* and request the next page if needed.

        Yields at most one ``Request`` (handled by this same callback) for
        the pagination link.  No item is yielded; the rows are written
        directly to the database.

        Raises IndexError or ValueError when the page lacks a readable
        aggregate rating or review count.
        """
        hxs = HtmlXPathSelector(response)
        item = UrbansItem()

        item['bid'] = 1
        item['url'] = response.url
        item['rating'] = float(self._text(
            hxs.select('//div[@itemprop="aggregateRating"]/div/meta/@content').extract()[0]))
        item['rv_count'] = int(self._text(
            hxs.select('//span[@itemprop="reviewCount"]/text()').extract()[0]))

        if item['rv_count'] <= 0:
            # No reviews: only the placeholders are filled in.
            # NOTE(review): the item is not yielded or stored in this branch.
            for key in ('rv_date', 'rv_profile', 'rv_rating', 'rv_dc'):
                item[key] = self._MISSING
            return

        review_total = len(hxs.select('//div[@itemprop="review"]').extract())
        containers = hxs.select('//div[@class="review-list"]/ul/li')

        def column(xpath):
            # The XPaths are absolute, so each one returns every match in the
            # document; they are evaluated once per response, not per review.
            # Without a review list every field falls back to the placeholder.
            try:
                return containers[0].select(xpath).extract()
            except IndexError:
                return []

        dates = column('//meta[@itemprop="datePublished"]/@content')
        profiles = column('//li[@class="user-name"]/a/text()')
        ratings = column('//div[@itemprop="reviewRating"]/div/meta/@content')
        bodies = column('//div[@class="review-content"]/p')

        for i in range(review_total):
            self.__class__.count = self.__class__.count + 1

            item['rv_date'] = self._field(dates, i, self._text, self._MISSING)
            item['rv_profile'] = self._field(profiles, i, self._text, self._MISSING)
            item['rv_dc'] = self._field(bodies, i, self._clean_body, self._MISSING)
            # A missing rating is passed to the database as None (SQL NULL),
            # which is what the unquoted NULL of the former query produced.
            rv_rating = self._field(
                ratings, i, lambda raw: float(self._text(raw)), None)
            item['rv_rating'] = self._MISSING if rv_rating is None else rv_rating

            cur.execute(self._INSERT_SQL, (
                item['bid'], item['url'], item['rating'], item['rv_count'],
                item['rv_date'], item['rv_profile'], item['rv_dc'], rv_rating,
                item['rv_dc'], item['rv_profile'],
            ))
            con.commit()

        print(self.__class__.count)

        if item['rv_count'] <= self.__class__.count:
            # Every reported review has been processed: reset the counter.
            self.__class__.count = 0
            return

        # NOTE(review): the first "prev-next" link is followed; confirm that
        # it is the *next* link on pages that also carry a previous link.
        links = hxs.select('//a[@class="page-option prev-next"]/@href').extract()
        nxt_link = self._text(links[0]) if links else ''
        if not nxt_link:
            # No further page to follow: reset the counter instead of failing
            # on the missing link and leaving a stale count behind.
            self.__class__.count = 0
            return

        # Resolve the href against the current page in case it is relative.
        yield Request(urljoin(response.url, nxt_link), callback=self.parse)
186+
187+
0 commit comments