pullingTheDB.py
import pandas as pd
import numpy as np
import requests
import random
from bs4 import BeautifulSoup
import sys
import codecs
sys.stdout.reconfigure(encoding='utf-8')

if __name__ == '__main__':
    # Each row of the categories file is one field/subfield pair to query.
    df = pd.read_csv("Categories/TheCatogories.csv", encoding="utf-8")
    for index, row in df.iterrows():
        field = row['Field_ID']
        subfield = row["SubField_ID"]
        # TODO: CurrPage needs to increase by 1 until the last page
        random_number = random.uniform(0, 1)
        print(random_number)
        parameters = {
            "PageID": 112,
            "CurrPage": 1,
            "spec": field,
            "spec1": subfield,
            "searchoption": "And",
            "rand": random_number
        }
        r = requests.get(
            "https://www.oea.org.lb/Arabic/GetMembers.aspx", params=parameters)
        response = r.text
        soup = BeautifulSoup(response, 'html.parser')
        # print(response)

        # Each result card exposes the engineer ID, Arabic name, Latin name
        # and details link through these CSS classes.
        engineer_IDs = soup.find_all(class_="date")
        arabic_names = soup.find_all(class_="company")
        latin_names = soup.find_all(class_="field")
        links = soup.find_all(class_="more")
        data = {"Engineer_ID": engineer_IDs,
                "Arabic_Names": arabic_names,
                "Latin_Names": latin_names,
                "Links": links
                }
        sample = pd.DataFrame(data=data)
        print(sample.info())

        # We need to remove the added divs and info: strip the wrapping
        # markup and labels so only the values remain.
        sample["Engineer_ID"] = sample["Engineer_ID"].astype(str).str.replace('<div class="date"><b>رقم المهندس: </b>', '')
        sample["Engineer_ID"] = sample["Engineer_ID"].str.replace('</div>', '')
        sample["Arabic_Names"] = sample["Arabic_Names"].astype(str).str.replace('<div class="company"><b>الاسم: </b>', '')
        sample["Arabic_Names"] = sample["Arabic_Names"].str.replace('</div>', '')
        sample["Latin_Names"] = sample["Latin_Names"].astype(str).str.replace('<div class="field"><b>Latin Name: </b>', '')
        sample["Latin_Names"] = sample["Latin_Names"].str.replace('</div>', '')
        sample["Links"] = sample["Links"].astype(str).str.replace('<div class="more"><a href="', '')
        sample["Links"] = sample["Links"].str.replace('">التفاصيل</a></div>', '')

        # Attach the category metadata for this field/subfield pair.
        sample["Field_ID"] = field
        sample["SubField_ID"] = subfield
        sample["Field"] = row["Field"]
        sample["SubField"] = row["SubField"]
        print(sample.info())
        sample.to_csv("Data/sample.csv", index=False)
        # TODO: keep calling until no more results are returned
        # TODO: merge all the DataFrames and write one CSV per subfield and per field
        break
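
# --- Sketch for the remaining TODOs (not part of the original script) -----
# The TODOs above ask for two things: advancing CurrPage until the endpoint
# stops returning rows, and merging the per-page frames into one CSV per
# subfield (and later per field). The helpers below are a minimal sketch of
# one way to do that; the names value_without_label and fetch_subfield_pages,
# the max_pages guard and the "empty page means we are done" check are
# assumptions about how GetMembers.aspx behaves, not verified behaviour.
# If adopted, these definitions would sit above the __main__ block.

def value_without_label(tag):
    # Assumed markup: <div ...><b>label: </b>value</div>; drop the <b> label
    # and return only the value text instead of string-replacing the HTML.
    label = tag.find("b")
    if label is not None:
        label.extract()
    return tag.get_text(strip=True)


def fetch_subfield_pages(field, subfield, max_pages=200):
    """Collect parsed result pages for one field/subfield until a page is empty."""
    frames = []
    for page in range(1, max_pages + 1):
        params = {
            "PageID": 112,
            "CurrPage": page,  # advance the page counter on every request
            "spec": field,
            "spec1": subfield,
            "searchoption": "And",
            "rand": random.uniform(0, 1),
        }
        r = requests.get("https://www.oea.org.lb/Arabic/GetMembers.aspx",
                         params=params)
        soup = BeautifulSoup(r.text, "html.parser")
        ids = soup.find_all(class_="date")
        if not ids:  # assumed: an empty result list marks the last page
            break
        frames.append(pd.DataFrame({
            "Engineer_ID": [value_without_label(t) for t in ids],
            "Arabic_Names": [value_without_label(t)
                             for t in soup.find_all(class_="company")],
            "Latin_Names": [value_without_label(t)
                            for t in soup.find_all(class_="field")],
            "Links": [t.a["href"] if t.a else ""
                      for t in soup.find_all(class_="more")],
        }))
    return frames

# Possible use inside the loop above: gather every page for the current
# subfield, concatenate, and write one CSV per subfield; the per-field CSV
# would then be a concat of its subfield files.
# pages = fetch_subfield_pages(field, subfield)
# if pages:
#     merged = pd.concat(pages, ignore_index=True)
#     merged.to_csv(f"Data/subfield_{subfield}.csv", index=False)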