-
Notifications
You must be signed in to change notification settings - Fork 0
/
bigbasket.py
68 lines (61 loc) · 2.96 KB
/
bigbasket.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Path to the input CSV file.
CSVFILE = "input.csv"
# Path to the chromedriver executable.
CHROME_DRIVER_PATH = '/usr/local/bin/chromedriver'

# Options for the single headless Chrome instance used by the whole script.
options = Options()
options.binary_location = '/usr/bin/google-chrome'
options.add_argument('--headless')

# chromedriver logging: verbose output written to a local log file.
service_log_path = "./chromedriver.log"
service_args = ['--verbose']

# The driver location and chromedriver flags are supplied through a Service
# object and the browser options through ``options=``; the older
# ``executable_path`` / ``chrome_options`` / ``service_log_path`` keywords are
# not accepted by current Selenium 4 releases.  The log file is passed as
# chromedriver's own ``--log-path`` flag so it does not depend on a Selenium
# keyword name.
service = Service(
    executable_path=CHROME_DRIVER_PATH,
    service_args=service_args + [f'--log-path={service_log_path}'],
)
driver = webdriver.Chrome(service=service, options=options)
def make_request(url):
    """Load a search page in the shared browser and scrape the first usable product.

    The page at ``url`` is opened with the module-level ``driver``, its HTML
    is parsed, and the product cards (``div`` elements with ``qa="product"``)
    are scanned in order.  The first card that yields both a numeric price
    and a "<amount> <unit>" weight is returned.

    Args:
        url: Address of the search-results page to load.

    Returns:
        A ``(price, weight, unit)`` tuple of strings, e.g. ``("45.50", "1", "kg")``.

    Raises:
        LookupError: If no product card on the page contains both a parsable
            price and a weight with a unit.
    """
    driver.get(url)
    # Fixed pause before reading the page source, presumably to give the
    # page time to finish rendering its results.
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    for product in soup.find_all("div", qa="product"):
        try:
            price_text = (
                product.find("div", qa="price")
                .find("span", class_="discnt-price")
                .text
            )
            weight_text = (
                product.find("div", class_="col-sm-12 col-xs-7 qnty-selection")
                .find("span", class_="ng-binding")
                .text
            )
        except AttributeError:
            # One of the lookups returned None: this card lacks the expected
            # price or weight markup, so try the next one.
            continue

        # Require at least one digit so a stray "." (as in "Rs. 45") is never
        # mistaken for the price; drop thousands separators first so "1,299"
        # is read as 1299 rather than 1.
        price_match = re.search(r'\d+(?:\.\d+)?', price_text.replace(",", ""))
        # split() without arguments ignores leading/repeated whitespace.
        weight_parts = weight_text.split()
        if price_match is None or len(weight_parts) < 2:
            continue

        return price_match.group(), weight_parts[0], weight_parts[1]

    raise LookupError(f"no product with a price and weight found at {url}")
# Read the input CSV, look up every item online and write the results to output.csv.
def read_csv():
    """Fill in price, weight and unit for every item listed in ``CSVFILE``.

    Reads ``CSVFILE`` into a DataFrame, builds a search URL from each value
    of the ``'Item Name'`` column, calls ``make_request`` for it and stores
    the result in the ``'Total Price'``, ``'Weight'`` and ``'Unit'`` columns
    (created if the input file does not have them).  Rows that cannot be
    processed are reported on stdout and left unchanged.  The updated table
    is written to ``output.csv`` without the index column.

    Raises:
        KeyError: If the input file has no ``'Item Name'`` column.
    """
    from urllib.parse import quote

    df = pd.read_csv(CSVFILE)

    for column in ('Total Price', 'Weight', 'Unit'):
        if column not in df.columns:
            df[column] = None
        # Scraped values are strings; an object column accepts them whatever
        # dtype pandas inferred for the existing cells.
        df[column] = df[column].astype(object)

    for i, item_name in df['Item Name'].items():
        if not isinstance(item_name, str):
            # Empty cells are read as NaN; there is nothing to search for.
            print(i, "", item_name, " skipped: item name is not text")
            continue
        try:
            # Percent-encode the whole name (spaces become %20) so characters
            # such as "&" cannot break the query string.
            product_name = quote(item_name.lower(), safe="")
            url = f"https://www.bigbasket.com/ps/?q={product_name}"
            price, weight, unit = make_request(url)
        except Exception as e:
            # Best effort: one failed lookup must not abort the whole run,
            # but it is reported instead of being silently dropped.
            print(i, "", item_name, " failed:", e)
            continue

        print(i, "", item_name, " ", price, " ", weight, unit)
        # Write through .at on the frame itself; chained df[col][i] = value
        # may update a temporary copy and leave df untouched.
        df.at[i, 'Total Price'] = price
        df.at[i, 'Weight'] = weight
        df.at[i, 'Unit'] = unit

    df.to_csv("output.csv", index=False)
# Run the scrape only when executed as a script, and always shut the browser
# down afterwards so no headless Chrome/chromedriver process is left behind.
if __name__ == "__main__":
    try:
        read_csv()
    finally:
        driver.quit()