Skip to content

Commit 6d8cfbf

Browse files
hi
1 parent 1379f28 commit 6d8cfbf

File tree

1 file changed

+89
-1
lines changed

1 file changed

+89
-1
lines changed

ur.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def parse_sub(self, response):
5959

6060

6161

62+
6263
<html>
6364
<head>
6465
<title> Sprites </title>
@@ -96,4 +97,91 @@ def parse_sub(self, response):
9697

9798
</div>
9899
</body>
99-
</html>
100+
</html>
101+
102+
103+
" Enter Url here"
104+
105+
# Yelp business pages the spider starts crawling from.
url = [
    'http://www.yelp.com/biz/firebrew-virginia-beach',
    'http://www.yelp.com/biz/sabra-design-washington-3?osq=web+design+companies',
]
106+
107+
from scrapy.spider import BaseSpider
108+
from scrapy.http import Request
109+
from urlparse import urljoin
110+
from scrapy.selector import HtmlXPathSelector
111+
from urbans.items import UrbansItem
112+
import psycopg2
113+
import time
114+
115+
116+
""" Database Creation"""
117+
# Module-level PostgreSQL connection and cursor, opened when this module is
# imported and used by the spider for its inserts.
# NOTE(review): the credentials are hard-coded in source; consider moving them
# to configuration.
con = psycopg2.connect(
    host="localhost",
    database="mm",
    user="postgres",
    password="mercuryminds",
)
cur = con.cursor()
119+
120+
class UrbansSpider(BaseSpider):
    """Spider that reads Yelp business pages and stores each review in PostgreSQL.

    For every response the business-level rating and review count are read,
    then one row per review found on the page is inserted into ``public.y``
    through the module-level ``cur``/``con`` objects.  A row is skipped when a
    row with the same review text (``dc``) and reviewer name (``p``) already
    exists.  While the running ``count`` is below the business's review count,
    the first "prev-next" pagination link is followed with this same callback.

    NOTE(review): the block nesting below was reconstructed from statement
    order; confirm it against the original ``ur.py``.
    """

    name = "ylp"
    start_urls = list(url)
    # NOTE(review): opened when the class body runs and never written to or
    # closed in this class.  Kept so that 'opt.csv' is still created/truncated
    # exactly as before.
    f = open('opt.csv', 'w')
    # Running number of reviews processed.  Stored on the class, so it is
    # shared by every response this spider handles.
    # NOTE(review): responses for different businesses may interleave and mix
    # their counts -- confirm whether requests are processed one at a time.
    count = 0

    # Placeholder stored in an item field that could not be extracted.
    _MISSING = 'NULL'

    # Parameterized form of the original statement; values are bound by the
    # driver instead of being formatted into the SQL text.
    _INSERT_SQL = (
        "insert into public.y "
        "select %s,%s,%s,%s,%s,%s,%s,%s "
        "where not exists "
        "( select * from public.y where dc=%s and p=%s)"
    )

    @classmethod
    def _text_at(cls, values, index):
        """Return ``values[index]`` UTF-8 encoded and stripped, or 'NULL' if absent."""
        try:
            return values[index].encode('utf-8').strip()
        except IndexError:
            return cls._MISSING

    @classmethod
    def _rating_at(cls, values, index):
        """Return ``values[index]`` as a float, or 'NULL' if absent or not numeric."""
        try:
            return float(values[index].encode('utf-8').strip())
        except (IndexError, ValueError):
            return cls._MISSING

    @classmethod
    def _body_at(cls, values, index):
        """Return the cleaned review text at ``index``, or 'NULL' if unavailable.

        The text is whatever follows ``lang="en">`` in the extracted ``<p>``
        markup, with ``<br>``/``</p>`` tags and non-breaking spaces removed and
        apostrophes replaced by spaces.
        """
        try:
            text = values[index].split('lang="en">')[1]
        except IndexError:
            return cls._MISSING
        text = text.replace(u'<br>', u'').replace(u'</p>', u'')
        # Drop the non-breaking space before encoding: removing its UTF-8
        # bytes afterwards would also damage other multi-byte characters.
        text = text.replace(u'\xa0', u'')
        # Apostrophes are still replaced so the stored text (and therefore the
        # duplicate check) stays comparable with previously inserted rows.
        text = text.replace(u"'", u" ")
        return text.encode('utf-8').strip()

    def parse(self, response):
        """Store the reviews on ``response`` and yield a request for the next page.

        Raises IndexError/ValueError when the page has no aggregate rating or
        review count, as the original code did.
        """
        hxs = HtmlXPathSelector(response)
        item = UrbansItem()

        item['bid'] = 1
        item['url'] = response.url
        item['rating'] = float(hxs.select('//div[@itemprop="aggregateRating"]/div/meta/@content').extract()[0].encode('utf-8').strip())
        item['rv_count'] = int(hxs.select('//span[@itemprop="reviewCount"]/text()').extract()[0].encode('utf-8').strip())

        if item['rv_count'] > 0:
            review_total = len(hxs.select('//div[@itemprop="review"]').extract())

            # These XPaths start with '//' and are therefore evaluated against
            # the whole document, so each list is extracted once and indexed
            # per review instead of being re-queried on every iteration.
            dates = hxs.select('//meta[@itemprop="datePublished"]/@content').extract()
            profiles = hxs.select('//li[@class="user-name"]/a/text()').extract()
            ratings = hxs.select('//div[@itemprop="reviewRating"]/div/meta/@content').extract()
            bodies = hxs.select('//div[@class="review-content"]/p').extract()

            for i in range(review_total):
                self.__class__.count = self.__class__.count + 1

                item['rv_date'] = self._text_at(dates, i)
                item['rv_profile'] = self._text_at(profiles, i)
                item['rv_rating'] = self._rating_at(ratings, i)
                item['rv_dc'] = self._body_at(bodies, i)

                # A missing rating is bound as SQL NULL, which is what the
                # unquoted 'NULL' placeholder produced in the formatted SQL.
                if item['rv_rating'] == self._MISSING:
                    rating_param = None
                else:
                    rating_param = item['rv_rating']

                cur.execute(self._INSERT_SQL, (
                    item['bid'],
                    item['url'],
                    item['rating'],
                    item['rv_count'],
                    item['rv_date'],
                    item['rv_profile'],
                    item['rv_dc'],
                    rating_param,
                    item['rv_dc'],
                    item['rv_profile'],
                ))
                # Committed per row so already-stored reviews survive a later failure.
                con.commit()

            print(self.__class__.count)

            next_link = ''
            if item['rv_count'] > self.__class__.count:
                # NOTE(review): the first "prev-next" link is taken; on later
                # pages this may be the "previous" link -- verify the markup.
                hrefs = hxs.select('//a[@class="page-option prev-next"]/@href').extract()
                if hrefs:
                    # Resolve against the page URL so relative hrefs also work.
                    next_link = urljoin(response.url, hrefs[0].encode('utf-8').strip())

            if next_link:
                # More reviews remain: fetch the next page with this callback.
                yield Request(next_link, callback=self.parse)
            else:
                # No further page to fetch for this business: reset the counter.
                self.__class__.count = 0

        else:
            # Business without reviews: fill the review fields with placeholders.
            item['rv_date'] = self._MISSING
            item['rv_profile'] = self._MISSING
            item['rv_rating'] = self._MISSING
            item['rv_dc'] = self._MISSING
185+
186+
187+

0 commit comments

Comments
 (0)