refactor all web scraping
noisecode3 committed Nov 9, 2024
1 parent c1a1fc4 commit e33192a
Showing 14 changed files with 1,552 additions and 894 deletions.
9 changes: 9 additions & 0 deletions CMakeLists.txt
@@ -16,8 +16,17 @@ if(POLICY CMP0167)
    cmake_policy(SET CMP0167 NEW)
endif()
find_package(Boost REQUIRED COMPONENTS system filesystem)

find_package(OpenSSL REQUIRED)

if(NOT EXISTS "${CMAKE_SOURCE_DIR}/libs/miniz/CMakeLists.txt")
    message(STATUS "Submodule 'libs/miniz' not found. Initializing submodules...")
    execute_process(
        COMMAND git submodule update --init --recursive
        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
    )
endif()

add_subdirectory(libs/miniz)

# Suppress ZERO_CHECK; it doesn't seem to be needed
3 changes: 2 additions & 1 deletion database/.gitignore
@@ -1,4 +1,5 @@
data.json
file_info.json
trle
__pycache__

trle.tar.gz
File renamed without changes.
29 changes: 29 additions & 0 deletions database/get_trle.sh
@@ -0,0 +1,29 @@
#!/bin/bash

# Check if megadl or mega-get is installed
if command -v megadl &> /dev/null; then
    downloader="megadl"
elif command -v mega-get &> /dev/null; then
    downloader="mega-get"
else
    echo "Neither megatools (megadl) nor megacmd (mega-get) is installed."
    echo "Please install one of them to proceed."
    exit 1
fi

# Define the download link
url="https://mega.nz/file/xXkV3JqJ#1Ejtd9enidYYpV3FRLO5KSzcUg7-_Jg-vNi66RKo8aI"

# Download the file using the available tool
echo "Using $downloader to download the file..."
$downloader "$url"

# Verify the checksum against the expected value and abort on mismatch
echo "Verifying checksum..."
echo "29e7e89bc11ebe77eafbd1c78ca3f1a7  trle.tar.gz" | md5sum -c - || { echo "Checksum verification failed."; exit 1; }

# Extract the tar.gz file
echo "Extracting the archive..."
tar xzf trle.tar.gz

echo "Download and extraction complete."
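For systems without md5sum, the same verify-and-extract steps can be reproduced with the Python standard library; a minimal sketch, with the filename and expected hash taken from the script above:

import hashlib
import tarfile

EXPECTED_MD5 = "29e7e89bc11ebe77eafbd1c78ca3f1a7"  # expected checksum from get_trle.sh

def verify_and_extract(path="trle.tar.gz"):
    """Verify the archive's MD5 checksum, then extract it into the current directory."""
    md5 = hashlib.md5()
    with open(path, "rb") as archive:
        for chunk in iter(lambda: archive.read(1 << 20), b""):  # read in 1 MiB chunks
            md5.update(chunk)
    if md5.hexdigest() != EXPECTED_MD5:
        raise ValueError(f"Checksum mismatch for {path}")
    with tarfile.open(path, "r:gz") as tar:
        tar.extractall()

if __name__ == "__main__":
    verify_and_extract()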
72 changes: 72 additions & 0 deletions database/get_trle_by_id_range.py
@@ -0,0 +1,72 @@
"""
This script retrieves all TRLE levels by a specified ID range and saves the data as JSON files.
Usage: python3 get_trle_by_id_range.py FROM_ID TO_ID
"""

import sys
import time
import json

import scrape
import data_factory

def safe_string_to_int(id_str):
    """Converts a string to an integer with error checking.

    Args:
        id_str (str): The string to convert.

    Returns:
        int: The converted integer if valid.

    Exits:
        Exits with status code 1 if the input string is not a valid integer.
    """
    try:
        return int(id_str)
    except ValueError:
        print("Error: The provided string is not a valid integer.")
        sys.exit(1)


def trle_by_id(trle_id):
    """Fetches TRLE level data by ID and saves it as a JSON file.

    Args:
        trle_id (int): The ID of the TRLE level to fetch.
    """
    data = data_factory.make_trle_tombll_data()
    soup = scrape.get_soup(f"https://www.trle.net/sc/levelfeatures.php?lid={trle_id}")
    scrape.get_trle_level(soup, data)
    if data['title']:
        with open(f'trle/{trle_id}.json', mode='w', encoding='utf-8') as json_file:
            json.dump(data, json_file)


if __name__ == '__main__':

    if len(sys.argv) != 3:
        print("Usage: python3 get_trle_by_id_range.py FROM_ID TO_ID")
        sys.exit(1)

    print("Please use get_trle.sh instead; this script is provided as part of the open-source license.")
    if input("Continue? (y/n): ").lower() != 'y':
        sys.exit(1)

    # Convert arguments to integers with validation
    from_id = safe_string_to_int(sys.argv[1])
    to_id = safe_string_to_int(sys.argv[2])

    if from_id == to_id:
        trle_by_id(from_id)
        sys.exit(0)

    # Ensure from_id is less than to_id by swapping if necessary
    if from_id > to_id:
        from_id, to_id = to_id, from_id  # Tuple unpacking for a clean swap

    # Fetch and save data for each level ID in the specified range
    for level_id in range(from_id, to_id + 1):  # Include to_id in range
        print(f"Getting TRLE level by ID: {level_id}")
        trle_by_id(level_id)
        time.sleep(5)  # Delay between requests to avoid rate-limiting
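The scrape module itself is not part of this excerpt. For orientation, get_soup presumably fetches a page and returns a parsed BeautifulSoup tree; a minimal sketch under that assumption (the real module may instead route traffic through the project's https.py pycurl wrapper):

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    """Fetch a URL and parse it into a BeautifulSoup tree (assumed interface)."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Fail loudly on HTTP errors
    return BeautifulSoup(response.text, "html.parser")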
5 changes: 3 additions & 2 deletions database/https.py
@@ -13,7 +13,8 @@
from tqdm import tqdm

import get_leaf_cert
import data_factorie
import data_factory

class AcquireLock:
"""
Create a TCP socket to ensure a single instance.
@@ -318,7 +319,7 @@ def download_file(self, url):
        """
        curl = pycurl.Curl()
        temp_cert_path = None
        zip_file = data_factorie.make_zip_file()  # Initialize the zip_file dictionary
        zip_file = data_factory.make_zip_file()  # Initialize the zip_file dictionary

        try:
            # Get file size for the progress bar
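make_zip_file lives in data_factory (this commit also updates imports from the old data_factorie module). Its fields are not visible in this diff; the names below are assumptions, shown only to illustrate the dictionary-factory pattern the module uses:

def make_zip_file():
    """Return an empty zip-file record (all field names here are assumptions)."""
    return {
        "name": "",      # archive file name
        "size": 0,       # size in bytes, used for the download progress bar
        "md5": "",       # checksum of the downloaded archive
        "url": "",       # source URL
        "content": b"",  # raw bytes filled in by download_file()
    }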
3 changes: 3 additions & 0 deletions database/ideas.txt
@@ -252,3 +252,6 @@ will focus on creating an index database of around 500 MB, rather than replicating
the entire TRLE database, which could exceed 20 GB. Additional data, such as levels
of specific interest to users, can be cached or downloaded manually, within a reasonable
limit of around 2 GB.

This was a special walkthrough that the script can't handle:
https://www.trle.net/sc/Levelwalk.php?lid=864
20 changes: 10 additions & 10 deletions database/index_main.py
Original file line number Diff line number Diff line change
@@ -3,26 +3,26 @@
import os
import time
import index_view
import index_scrape
import scrape
import index_query
import make_index_database

def test_trle():
    """Browse TRLE data"""
    index_view.print_trle_page(index_scrape.get_trle_page(0, True))
    index_view.print_trle_page(scrape.get_trle_page(0, True))
    offset = 0
    while True:
        user_input = input("Press Enter for the next page (or type 'q' to quit): ")
        if user_input.lower() == 'q':
            print("Exiting...")
            break
        offset += 20
        index_view.print_trle_page(index_scrape.get_trle_page(offset, True))
        index_view.print_trle_page(scrape.get_trle_page(offset, True))


def test_trcustoms():
    """Browse TRCustoms data"""
    page = index_scrape.get_trcustoms_page(1, True)
    page = scrape.get_trcustoms_page(1, True)
    index_view.print_trcustoms_page(page)
    offset = 1
    while True:
@@ -31,7 +31,7 @@ def test_trcustoms():
            print("Exiting...")
            break
        offset += 1
        page = index_scrape.get_trcustoms_page(offset, True)
        page = scrape.get_trcustoms_page(offset, True)
        index_view.print_trcustoms_page(page)


Expand All @@ -58,7 +58,7 @@ def test_trcustoms_pic_local():
while True:
page = index_query.get_trcustoms_page_local(offset, True)
levels = page['levels']
covers = index_scrape.get_cover_list(levels)
covers = scrape.get_trcustoms_cover_list(levels, True)
index_view.display_menu(levels, covers)
for file in covers:
try:
@@ -89,7 +89,7 @@ def test_insert_trle_book():
    method that is much slower but works and is accurate."""

    # Get the first page to determine the total number of records
    page = index_scrape.get_trle_page(0)
    page = scrape.get_trle_page(0)
    total_records = page['records_total']

    # Insert the first page of data
@@ -101,7 +101,7 @@ def test_insert_trle_book():
    offset = 20
    while offset < total_records:
        # Fetch the next page of data
        page = index_scrape.get_trle_page(offset)
        page = scrape.get_trle_page(offset)
        index_query.insert_trle_page(page)

        # Increment offset by 20 for the next batch
@@ -118,7 +118,7 @@ def test_insert_trle_book():
def test_insert_trcustoms_book():
    """Get index data"""
    # Get the first page to determine the total number of records
    page = index_scrape.get_trcustoms_page(1)
    page = scrape.get_trcustoms_page(1)
    total_pages = page['total_pages']

    # Insert the first page of data
@@ -130,7 +130,7 @@ def test_insert_trcustoms_book():
    page_number = 2
    while page_number <= total_pages:
        # Fetch the next page of data
        page = index_scrape.get_trcustoms_page(page_number)
        page = scrape.get_trcustoms_page(page_number)
        index_query.insert_trcustoms_page(page)
        print(f"Page number:{page_number} of {total_pages}")
14 changes: 7 additions & 7 deletions database/index_query.py
@@ -2,7 +2,7 @@
import sys
import sqlite3

import data_factorie
import data_factory
os.chdir(os.path.dirname(os.path.abspath(__file__)))


@@ -373,7 +373,7 @@ def get_trle_level_local_by_id(trle_id):

    records = []
    for record in result:
        level = data_factorie.make_trle_level_data()
        level = data_factory.make_trle_level_data()
        level['trle_id'] = record[0]
        level['author'] = record[1]
        level['title'] = record[2]
@@ -423,7 +423,7 @@ def get_trcustoms_level_local_by_id(trcustoms_id):
        """, (trcustoms_id, ), cursor
    )

    level = data_factorie.make_trcustoms_level_data()
    level = data_factory.make_trcustoms_level_data()
    level['trcustoms_id'] = result[0][0]
    level['authors'] = result[0][1].split(',') if result[0][1] else []
    level['title'] = result[0][2]
@@ -454,7 +454,7 @@ def get_trle_page_local(offset, sortCreatedFirst=False):
    if offset > rec:
        sys.exit(1)

    page = data_factorie.make_trle_page_data()
    page = data_factory.make_trle_page_data()
    page['offset'] = offset
    page['records_total'] = rec

@@ -482,7 +482,7 @@ def get_trle_page_local(offset, sortCreatedFirst=False):
    )
    # Process result to format the output as needed
    for row in result:
        level = data_factorie.make_trle_level_data()
        level = data_factory.make_trle_level_data()
        level['trle_id'] = row[0]
        level['author'] = row[1]
        level['title'] = row[2]
@@ -511,7 +511,7 @@ def get_trcustoms_page_local(page_number, sortCreatedFirst=False):
        cursor
    )[0][0]

    page = data_factorie.make_trcustoms_page_data()
    page = data_factory.make_trcustoms_page_data()
    total = (rec + 19) // 20  # Ceiling division: number of 20-level pages
    if page_number > total:
        sys.exit(1)
@@ -550,7 +550,7 @@ def get_trcustoms_page_local(page_number, sortCreatedFirst=False):
    )
    # Process result to format the output as needed
    for row in result:
        level = data_factorie.make_trcustoms_level_data()
        level = data_factory.make_trcustoms_level_data()
        level['trcustoms_id'] = row[0]
        level['authors'] = row[1].split(',') if row[1] else []
        level['title'] = row[2]
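The SQL in these functions is executed through a small helper that takes the query text, its parameters, and a cursor, and returns the matching rows. The helper itself is outside this excerpt; a plausible shape, with the name assumed:

def query_rows(sql, params, cursor):
    """Run a parameterized query and return all rows (helper name is an assumption)."""
    cursor.execute(sql, params)
    return cursor.fetchall()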