-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_get_occurrences.jl
135 lines (111 loc) · 4.37 KB
/
01_get_occurrences.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#### Preparing the query for the GBIF download API ###
# Adapted from the code I wrote in https://github.com/graciellehigino/ms_range_interactions/blob/master/code/A3_gbif_query.jl
include("A0_required.jl")

## 1. GBIF Query ###
# Build the request sent to the GBIF download API for the species occurrences.
# This is the exact code used to create our query on October 21st 2022.
# Re-running this 1st part yields a *different* query as soon as new species
# observations are added to GBIF, so only re-run it to create a query of your
# own. The 2nd part of the script, in contrast, always downloads the exact
# same set of occurrences generated by our original query.

# Bounding box for the study area (decimal degrees)
left, right, bottom, top = (-175.0, -45.0, 10.0, 90.0)

# Species pool: every taxon appearing in the reconciled Canadian metaweb
mw_path = joinpath("data", "input", "canadian_thresholded_reconciled.csv")
mw_output = DataFrame(CSV.File(mw_path; stringtype=String))
mammals = sort!(union(mw_output.from, mw_output.to))

# Resolve each name against the GBIF taxonomy to obtain its taxon key
sp_taxon = taxon.(mammals)
sp_names = map(t -> t.species.first, sp_taxon)
sp_codes = unique!(map(t -> t.species.second, sp_taxon))
# Add taxa codes and bounding box to the JSON query.
# The query asks for: (1) any of the resolved taxon keys, (2) records that
# carry coordinates, (3) records falling inside the bounding-box polygon.
# NOTE(fix): the WKT polygon is kept on a single line — a raw newline inside a
# JSON string value is invalid JSON (RFC 8259) and strict parsers reject it.
query = """
{
"creator": "username",
"notification_address": [
"useremail"
],
"sendNotification": true,
"format": "SIMPLE_CSV",
"predicate": {
"type": "and",
"predicates": [
{
"type": "in",
"key": "TAXON_KEY",
"values": $(sp_codes)
},
{
"type":"equals",
"key":"HAS_COORDINATE",
"value":"true"
},
{
"type":"within",
"geometry":"POLYGON(($left $top, $left $bottom, $right $bottom, $right $top, $left $top))"
}
]}
}
"""

# Write the query next to the other intermediate files
# (mkpath is a no-op when the folder already exists)
mkpath("xtras")
open(joinpath("xtras", "query.json"), "w") do io
    println(io, query)
end
# Next edit the query file with your GBIF username and email
# Then add your username and password to the following curl command
# and run it in a terminal
# curl --include --user username:password --header "Content-Type: application/json" --data @xtras/query.json https://api.gbif.org/v1/occurrence/download/request
# This will send the request to GBIF. You will then receive an email with a
# download link (probably within minutes)
# ⚠️ DO NOT COMMIT THE FILES WITH YOUR USERNAME, EMAIL, OR PASSWORD ⚠️
## 2. Download dataset ####
# This part is fully reproducible: it can be re-run at any time and always
# fetches the exact same set of occurrences generated by our own query.

# Paths for the archived and extracted occurrence data
gbif_path = joinpath("xtras", "gbif")
csv_file = joinpath(gbif_path, "gbif_occurrences.csv")
zip_file = joinpath(gbif_path, "gbif_occurrences.zip")
mkpath(gbif_path)  # no-op when the folder already exists

# Download & extract the dataset only if the extracted CSV is absent
if !isfile(csv_file)
    # Skip the download when a previous run already left the archive behind
    if !isfile(zip_file)
        @info "Downloading data from GBIF (206 MB)"
        download_url = "https://api.gbif.org/v1/occurrence/download/request/0111374-220831081235567.zip"
        Downloads.download(download_url, zip_file)
    end
    @info "Extracting data (1.1 GB)"
    archive = ZipFile.Reader(zip_file)
    write(csv_file, read(first(archive.files)))
    close(archive)
    rm(zip_file)  # the extracted CSV is all we need from here on
end
## 3. Select the columns of interest ###
# Load the dataset, reading only the columns used downstream.
# quoted=false is absolutely necessary to avoid a bug while reading: after
# manual verification the data is not quoted, but some occurrenceID values
# start with a `"` (e.g. "AFEW-DSCN0025), which CSV.jl would otherwise treat
# as an opening quote character.
cols = [:occurrenceID, :species, :decimalLongitude, :decimalLatitude]
occ_df = CSV.read(csv_file, DataFrame; delim="\t", quoted=false, select=cols)

# Sanity checks. These comparisons were previously computed but their results
# discarded, so a corrupted or updated download would have gone unnoticed —
# fail loudly instead.
nrow(occ_df) == 1_894_342 ||
    error("Unexpected number of occurrences: got $(nrow(occ_df)), expected 1894342")
isequal(sort(unique(occ_df.species)), mammals) ||
    error("Species in the occurrence data do not match the metaweb species list")

# Keep and rename the columns used by the rest of the pipeline
select!(occ_df, [:species, :decimalLongitude, :decimalLatitude])
rename!(occ_df, [:name, :longitude, :latitude])
# Split by species and export one CSV of coordinates per species
occ_path = joinpath("data", "occurrences")
ispath(occ_path) || mkpath(occ_path)
for species_name in mammals
    # Coordinates for this species only, without the now-redundant name column
    species_df = select!(filter(:name => ==(species_name), occ_df), Not(:name))
    # Species names become file names with underscores instead of spaces
    filename = replace(species_name, " " => "_") * ".csv"
    CSV.write(joinpath(occ_path, filename), species_df)
end
# CSV.write(joinpath("data", "clean", "gbif_occurrences.csv"), occ_df)