diff --git a/database/https.py b/database/https.py index e75f664..aded656 100644 --- a/database/https.py +++ b/database/https.py @@ -216,7 +216,14 @@ def get_response(self, url, content_type): def pack_response_buffer(self, content_type, response_buffer): """Validate and return the response based on content type""" if content_type == 'text/html': - return response_buffer.getvalue().decode('utf-8', errors='ignore') + raw_data = response_buffer.getvalue() + for encoding in ['utf-8', 'windows-1252', 'utf-16', 'utf-32']: + try: + return raw_data.decode(encoding) + except UnicodeDecodeError: + continue + logging.error("No known encoding") + sys.exit(1) if content_type == 'application/json': return json.loads(response_buffer.getvalue().decode('utf-8')) if content_type in ['image/jpeg', 'image/png']: @@ -357,7 +364,6 @@ def download_file(self, url): return {} # Return an empty dict on error self.status = 0 - print("Downloaded successfully.") # Finalize MD5 checksum md5_hash = hashlib.md5(usedforsecurity=False) @@ -413,7 +419,3 @@ def release_lock(): def is_locked(): """Lock this instance""" ACQUIRE_LOCK.is_locked() - - -#if __name__ == '__main__': -# print(get("https://www.trle.net/scadm/trle_dl.php?lid=3667", 'application/zip')) diff --git a/database/ideas.txt b/database/ideas.txt index 708315b..0d92a36 100644 --- a/database/ideas.txt +++ b/database/ideas.txt @@ -233,3 +233,22 @@ class Downloader: Never forget how we can test one function in python: python3 -c "from index_scrape import get_trle_page; print(get_trle_page(0, True))" + +Tracking TRLE Records Efficiently + +For TRCustoms, tracking and indexing records is streamlined and fast through JSON. + +However, tracking records on TRLE is more complex. One approach could involve scanning +the latest pages and identifying any gaps in ID numbers. These gaps may indicate +deleted records, though it’s likely not necessary to handle them differently. However, +if a new record’s ID deviates significantly from the last known increment—for instance, +going from IDs 666, 667, and 668 to a much lower number like 345—then records +with these IDs should be checked for updates if they already exist in our database. + +A full resync of TRLE data will be infrequent, as it would require around 3,700 requests, +potentially taking 1-2 days—something not feasible for regular users. To keep our +index database relevant and manageable, a yearly refresh should suffice. This approach +will focus on creating an index database of around 500 MB, rather than replicating +the entire TRLE database, which could exceed 20 GB. Additional data, such as levels +of specific interest to users, can be cached or downloaded manually, within a reasonable +limit of around 2 GB. diff --git a/database/index_scrape.py b/database/index_scrape.py index 7757675..7f5b91a 100644 --- a/database/index_scrape.py +++ b/database/index_scrape.py @@ -353,26 +353,3 @@ def get_key(id_number): print("Serial Number:", serial_number) - #return validate_downloaded_key(id_number, serial_number) - - -''' -# Create a temporary file to hold the combined certificate -with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_cert_file: - # Write all certificates into the temporary file - for cert in cert_list: - temp_cert_file.write(cert) - - # Store the name of the temporary file - temp_cert_filename = temp_cert_file.name - -# Now use this temporary file with requests for SSL verification -response = requests.get(url, verify=temp_cert_filename, timeout=5) - -# Once done, you can clean up the temporary file -import os -os.remove(temp_cert_filename) -''' - -#ab:a9:b5:e7:a4:8c:f3:fc:5a:73:da:16:04:36:03:20 -#https://crt.sh/?serial=ab%3Aa9%3Ab5%3Ae7%3Aa4%3A8c%3Af3%3Afc%3A5a%3A73%3Ada%3A16%3A04%3A36%3A03%3A20 diff --git a/database/parser.py b/database/parser.py new file mode 100644 index 0000000..1c0d4ae --- /dev/null +++ b/database/parser.py @@ -0,0 +1,21 @@ +import re +# Replace custom tags with HTML spans and apply classes +# https://raw.githubusercontent.com/rr-/TRCustoms/develop/frontend/src/components/markdown-composer/MarkdownButtons/index.tsx +def custom_markdown_parser(text): + text = re.sub(r'\[o\](.*?)\[/o\]', r'\1', text) # blue text for objects + text = re.sub(r'\[s\](.*?)\[/s\]', r'\1', text) # secret styling + text = re.sub(r'\[p\](.*?)\[/p\]', r'\1', text) # pickup styling + text = re.sub(r'\[e\](.*?)\[/e\]', r'\1', text) # enemy styling + text = re.sub(r'\[t\](.*?)\[/t\]', r'\1', text) # trap styling + text = re.sub(r'\[center\](.*?)\[/center\]', r'