Commit 4eb0917

Merge remote-tracking branch 'origin/master'

sal-uva committed Oct 15, 2024
2 parents e556c2d + cbbf89e
Showing 28 changed files with 941 additions and 283 deletions.
36 changes: 29 additions & 7 deletions .github/workflows/docker_pr_test.yml
@@ -12,16 +12,34 @@ jobs:
     name: Test docker-compose up with build
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Run docker compose up
         run: docker compose -f docker-compose_build.yml up -d
-      - name: Wait and check log
+      - name: Check backend container is running
         run: |
+          sleep 30
+          if [ "$(docker ps | grep 4cat_backend)" ]; then
+            echo "Docker 4cat_backend container is running..."
+          else
+            echo -e "Docker 4cat_backend container is not running...\nPrinting 4cat_backend logs:\n\n$(docker container logs 4cat_backend)"
+            exit 1
+          fi
+      - name: Check frontend container is running
+        run: |
+          sleep 10
+          if [ "$(docker ps | grep 4cat_frontend)" ]; then
+            echo "Docker 4cat_frontend container is running..."
+          else
+            echo -e "Docker 4cat_frontend container is not running...\nPrinting 4cat_frontend logs:\n\n$(docker container logs 4cat_frontend)"
+            exit 1
+          fi
+      - name: Check 4CAT backend log for expected INFO message
+        run: |
           test_case=" INFO at api.py:65: Local API listening for requests at backend:4444"
           sleep 30 && var=$(docker exec 4cat_backend tail -n 1 logs/backend_4cat.log)
           echo "::group::Backend test"
           if [ "$(echo "$var" | tr "|" "\n" | sed -n '2p')" = "$test_case" ]; then
-            echo "Backend running as expected"
+            echo "4CAT backend running as expected"
           else
             echo "::error::Backend failed to start"
             echo "Test:$test_case"
@@ -32,7 +50,11 @@ jobs:
       - name: Print log on failure
         if: failure()
         run: |
-          docker cp 4cat_backend:/usr/src/app/logs/backend_4cat.log ./backend_4cat.log
-          echo "::group::Backend logs"
-          cat backend_4cat.log
-          echo "::endgroup::"
+          if [ "$(docker ps | grep 4cat)" ]; then
+            docker cp 4cat_backend:/usr/src/app/logs/backend_4cat.log ./backend_4cat.log
+            echo "::group::Backend logs"
+            cat backend_4cat.log
+            echo "::endgroup::"
+          else
+            echo "Docker containers not running; check logs in previous steps"
+          fi
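
The reworked workflow gates the log assertions on the containers actually being up, dumping container logs before failing when they are not. For local debugging, a minimal Python sketch of the same liveness check; the container names `4cat_backend` and `4cat_frontend` come from the compose file, while the exact-name matching via `--format` (instead of the workflow's grep) and everything else here is illustrative:

import subprocess

def container_running(name: str) -> bool:
    # `docker ps --format "{{.Names}}"` prints one running container name per line
    result = subprocess.run(
        ["docker", "ps", "--format", "{{.Names}}"],
        capture_output=True, text=True, check=True
    )
    return name in result.stdout.splitlines()

for container in ("4cat_backend", "4cat_frontend"):
    if not container_running(container):
        # mirror the workflow: print the container's logs before failing
        logs = subprocess.run(["docker", "container", "logs", container],
                              capture_output=True, text=True)
        raise SystemExit(f"{container} is not running:\n{logs.stdout}{logs.stderr}")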
3 changes: 2 additions & 1 deletion README.md
@@ -6,9 +6,10 @@
 [![Requires Python 3.8](https://img.shields.io/badge/py-v3.8-blue)](https://www.python.org/)
 [![Docker image status](https://github.com/digitalmethodsinitiative/4cat/actions/workflows/docker_latest.yml/badge.svg)](https://github.com/digitalmethodsinitiative/4cat/actions/workflows/docker_latest.yml)
 
-<p align="center">4CAT has a website at <a href="https://4cat.nl">4cat.nl</a>.</p>
 <p align="center"><img alt="A screenshot of 4CAT, displaying its 'Create Dataset' interface" src="common/assets/screenshot1.png"><img alt="A screenshot of 4CAT, displaying a network visualisation of a dataset" src="common/assets/screenshot2.png"></p>
 
+<p align="center">4CAT has a website at <a href="https://4cat.nl">4cat.nl</a>.</p>
+<p align="center"><a href="https://bsky.app/profile/4cat.nl">Follow 4CAT on Bluesky</a> for updates.</p>
 4CAT is a research tool that can be used to analyse and process data from
 online social platforms. Its goal is to make the capture and analysis of data
 from these platforms accessible to people through a web interface, without
52 changes: 37 additions & 15 deletions backend/workers/cleanup_tempfiles.py
@@ -3,7 +3,8 @@
 """
 import shutil
 import re
-
+import json
+from datetime import datetime
 from pathlib import Path
 
 from common.config_manager import config
@@ -27,12 +28,21 @@ class TempFileCleaner(BasicWorker):
 
     ensure_job = {"remote_id": "localhost", "interval": 10800}
 
+    # Use tracking file to delay deletion of files that may still be in use
+    tracking_file = config.get('PATH_DATA').joinpath(".temp_file_cleaner")
+    days_to_keep = 7
+
     def work(self):
         """
         Go through result files, and for each one check if it should still
         exist
         :return:
         """
+        # Load tracking file
+        if not self.tracking_file.exists():
+            tracked_files = {}
+        else:
+            tracked_files = json.loads(self.tracking_file.read_text())
 
         result_files = Path(config.get('PATH_DATA')).glob("*")
         for file in result_files:
@@ -41,6 +51,7 @@ def work(self):
                 continue
 
             if self.interrupted:
+                self.tracking_file.write_text(json.dumps(tracked_files))
                 raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files")
 
             # the key of the dataset files belong to can be extracted from the
@@ -59,20 +70,28 @@ def work(self):
             except DataSetException:
                 # the dataset has been deleted since, but the result file still
                 # exists - should be safe to clean up
-                self.log.info("No matching dataset with key %s for file %s, deleting file" % (key, str(file)))
-                if file.is_dir():
-                    try:
-                        shutil.rmtree(file)
-                    except PermissionError:
-                        self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no "
-                                      f"permissions), skipping")
-
-                else:
-                    try:
-                        file.unlink()
-                    except FileNotFoundError:
-                        # the file has been deleted since
-                        pass
+                if file.name not in tracked_files:
+                    self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion")
+                    tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400)
+                elif tracked_files[file.name] < datetime.now().timestamp():
+                    self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file")
+                    if file.is_dir():
+                        try:
+                            shutil.rmtree(file)
+                        except PermissionError:
+                            self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no "
+                                          f"permissions), skipping")
+
+                    else:
+                        try:
+                            file.unlink()
+                        except FileNotFoundError:
+                            # the file has been deleted since
+                            pass
+
+                    # Remove from tracking
+                    del tracked_files[file.name]
 
                 continue
 
             if file.is_dir() and "-staging" in file.stem and dataset.is_finished():
@@ -84,4 +103,7 @@ def work(self):
                     dataset.key, str(file)))
                 shutil.rmtree(file)
 
+        # Update tracked files
+        self.tracking_file.write_text(json.dumps(tracked_files))
+
         self.job.finish()
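
This change replaces immediate deletion of orphaned result files with a two-pass scheme: on first sighting, an orphan is recorded in the `.temp_file_cleaner` tracking file with a deadline `days_to_keep * 86400` seconds (seven days) out, and it is only deleted on a later run once that deadline has passed. A standalone sketch of the pattern, where the data folder path and the `find_orphans` helper are hypothetical stand-ins for the worker's dataset lookup:

import json
import time
from pathlib import Path

DATA_PATH = Path("/tmp/4cat-data")           # hypothetical data folder
TRACKING_FILE = DATA_PATH / ".temp_file_cleaner"
GRACE_SECONDS = 7 * 86400                    # mirrors days_to_keep = 7

def find_orphans():
    # hypothetical stand-in for "files with no matching dataset in the database"
    return [p for p in DATA_PATH.glob("*") if not p.name.startswith(".")]

DATA_PATH.mkdir(parents=True, exist_ok=True)
tracked = json.loads(TRACKING_FILE.read_text()) if TRACKING_FILE.exists() else {}
now = time.time()

for orphan in find_orphans():
    if orphan.name not in tracked:
        tracked[orphan.name] = now + GRACE_SECONDS   # first sighting: start the clock
    elif tracked[orphan.name] < now:
        orphan.unlink()                              # grace period expired: delete
        del tracked[orphan.name]

TRACKING_FILE.write_text(json.dumps(tracked))

The deadline is persisted between runs (and on worker interruption), so a file that briefly lacks a dataset record, for instance mid-import, is not destroyed prematurely.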
10 changes: 0 additions & 10 deletions common/config_manager.py
@@ -146,16 +146,6 @@ def ensure_database(self):
         """
         self.with_db()
 
-        # delete unknown keys
-        known_keys = tuple([names for names, settings in config.config_definition.items() if settings.get("type") not in UserInput.OPTIONS_COSMETIC])
-        unknown_keys = self.db.fetchall("SELECT DISTINCT name FROM settings WHERE name NOT IN %s", (known_keys,))
-
-        if unknown_keys:
-            self.db.log.info(f"Deleting unknown settings from database: {', '.join([key['name'] for key in unknown_keys])}")
-            self.db.delete("settings", where={"name": tuple([key["name"] for key in unknown_keys])}, commit=False)
-
-        self.db.commit()
-
         # create global values for known keys with the default
         known_settings = self.get_all()
         for setting, parameters in self.config_definition.items():
16 changes: 15 additions & 1 deletion common/lib/dataset.py
@@ -15,7 +15,7 @@
 from common.config_manager import config
 from common.lib.job import Job, JobNotFoundException
 from common.lib.module_loader import ModuleCollector
-from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int
+from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, get_software_version
 from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem
 from common.lib.fourcat_module import FourcatModule
 from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException,
@@ -1586,6 +1586,20 @@ def get_media_type(self):
         # Default to text
         return self.parameters.get("media_type", "text")
 
+    def get_metadata(self):
+        """
+        Get dataset metadata
+
+        This consists of all the data stored in the database for this dataset, plus the current 4CAT version (appended
+        as 'current_4CAT_version'). This is useful for exporting datasets, as it can be used by another 4CAT instance to
+        update its database (and ensure compatibility with the exporting version of 4CAT).
+        """
+        metadata = self.db.fetchone("SELECT * FROM datasets WHERE key = %s", (self.key,))
+
+        # get 4CAT version (presumably to ensure export is compatible with import)
+        metadata["current_4CAT_version"] = get_software_version()
+        return metadata
+
     def get_result_url(self):
         """
         Gets the 4CAT frontend URL of a dataset file.
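
The new `get_metadata()` bundles the dataset's database row with the exporting instance's version so that an importing 4CAT instance can check compatibility. A hedged sketch of how an importer might use that field; the comparison policy shown is illustrative, not 4CAT's actual import logic:

def check_import_compatibility(metadata: dict, local_version: str) -> None:
    """Illustrative check on an exported dataset's metadata before import."""
    # 'current_4CAT_version' is appended by Dataset.get_metadata() on export
    exported = metadata.get("current_4CAT_version")
    if exported != local_version:
        # hypothetical policy: warn on mismatch rather than refuse the import
        print(f"Dataset was exported from 4CAT {exported}; "
              f"this instance runs {local_version}. Check schema compatibility.")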
2 changes: 1 addition & 1 deletion datasources/douyin/search_douyin.py
@@ -218,7 +218,7 @@ def map_item(item):
        "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url")),
        # Adding this as different Douyin pages contain different data
-       "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}",
+       "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}",
        "region": item.get("region", ""),
        "hashtags": ",".join(
            [tag[hashtag_key] for tag in (item[text_extra_key] if item[text_extra_key] is not None else []) if