-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_get_occurrences.jl
135 lines (111 loc) · 4.37 KB
/
01_get_occurrences.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#### Preparing the query for the GBIF download API ###
# Adapted from the code I wrote in https://github.com/graciellehigino/ms_range_interactions/blob/master/code/A3_gbif_query.jl
include("A0_required.jl")

## 1. GBIF Query ###
# Build the request sent to the GBIF download API for the species occurrences.
# This is the exact code used to create our query on October 21st 2022.
# Re-running this 1st part yields a *different* query as soon as new species
# observations are added to GBIF, so only re-run it to create a query of your
# own. The 2nd part of the script, in contrast, always downloads the exact
# same set of occurrences generated by our original query.

# Bounding box for the study area (decimal degrees)
left, right, bottom, top = (-175.0, -45.0, 10.0, 90.0)

# Species pool: every taxon appearing in the reconciled Canadian metaweb
mw_path = joinpath("data", "input", "canadian_thresholded_reconciled.csv")
mw_output = DataFrame(CSV.File(mw_path; stringtype=String))
mammals = sort!(union(mw_output.from, mw_output.to))

# Resolve each name against the GBIF taxonomy to obtain its taxon key
sp_taxon = taxon.(mammals)
sp_names = map(t -> t.species.first, sp_taxon)
sp_codes = unique!(map(t -> t.species.second, sp_taxon))
# Add taxa codes and bounding box to the JSON query.
# The query asks for: (1) any of the resolved taxon keys, (2) records that
# carry coordinates, (3) records falling inside the bounding-box polygon.
# NOTE(fix): the WKT polygon is kept on a single line — a raw newline inside a
# JSON string value is invalid JSON (RFC 8259) and strict parsers reject it.
query = """
{
"creator": "username",
"notification_address": [
"useremail"
],
"sendNotification": true,
"format": "SIMPLE_CSV",
"predicate": {
"type": "and",
"predicates": [
{
"type": "in",
"key": "TAXON_KEY",
"values": $(sp_codes)
},
{
"type":"equals",
"key":"HAS_COORDINATE",
"value":"true"
},
{
"type":"within",
"geometry":"POLYGON(($left $top, $left $bottom, $right $bottom, $right $top, $left $top))"
}
]}
}
"""

# Write the query next to the other intermediate files
# (mkpath is a no-op when the folder already exists)
mkpath("xtras")
open(joinpath("xtras", "query.json"), "w") do io
    println(io, query)
end
# Next edit the query file with your GBIF username and email
# Then add your username and password to the following curl command
# and run it in a terminal
# curl --include --user username:password --header "Content-Type: application/json" --data @xtras/query.json https://api.gbif.org/v1/occurrence/download/request
# This will send the request to GBIF. You will then receive an email with a
# download link (probably within minutes)
# ⚠️ DO NOT COMMIT THE FILES WITH YOUR USERNAME, EMAIL, OR PASSWORD ⚠️
## 2. Download dataset ####
# This part is fully reproducible: it can be re-run at any time and always
# fetches the exact same set of occurrences generated by our own query.

# Paths for the archived and extracted occurrence data
gbif_path = joinpath("xtras", "gbif")
csv_file = joinpath(gbif_path, "gbif_occurrences.csv")
zip_file = joinpath(gbif_path, "gbif_occurrences.zip")
mkpath(gbif_path)  # no-op when the folder already exists

# Download & extract the dataset only if the extracted CSV is absent
if !isfile(csv_file)
    # Skip the download when a previous run already left the archive behind
    if !isfile(zip_file)
        @info "Downloading data from GBIF (206 MB)"
        download_url = "https://api.gbif.org/v1/occurrence/download/request/0111374-220831081235567.zip"
        Downloads.download(download_url, zip_file)
    end
    @info "Extracting data (1.1 GB)"
    archive = ZipFile.Reader(zip_file)
    write(csv_file, read(first(archive.files)))
    close(archive)
    rm(zip_file)  # the extracted CSV is all we need from here on
end
## 3. Select the columns of interest ###
# Load the dataset, reading only the columns used downstream.
# quoted=false is absolutely necessary to avoid a bug while reading: after
# manual verification the data is not quoted, but some occurrenceID values
# start with a `"` (e.g. "AFEW-DSCN0025), which CSV.jl would otherwise treat
# as an opening quote character.
cols = [:occurrenceID, :species, :decimalLongitude, :decimalLatitude]
occ_df = CSV.read(csv_file, DataFrame; delim="\t", quoted=false, select=cols)

# Sanity checks. These comparisons were previously computed but their results
# discarded, so a corrupted or updated download would have gone unnoticed —
# fail loudly instead.
nrow(occ_df) == 1_894_342 ||
    error("Unexpected number of occurrences: got $(nrow(occ_df)), expected 1894342")
isequal(sort(unique(occ_df.species)), mammals) ||
    error("Species in the occurrence data do not match the metaweb species list")

# Keep and rename the columns used by the rest of the pipeline
select!(occ_df, [:species, :decimalLongitude, :decimalLatitude])
rename!(occ_df, [:name, :longitude, :latitude])
# Split by species and export one CSV of coordinates per species
occ_path = joinpath("data", "occurrences")
ispath(occ_path) || mkpath(occ_path)
for species_name in mammals
    # Coordinates for this species only, without the now-redundant name column
    species_df = select!(filter(:name => ==(species_name), occ_df), Not(:name))
    # Species names become file names with underscores instead of spaces
    filename = replace(species_name, " " => "_") * ".csv"
    CSV.write(joinpath(occ_path, filename), species_df)
end
# CSV.write(joinpath("data", "clean", "gbif_occurrences.csv"), occ_df)