Skip to content

Commit

Permalink
Merge branch 'master' into scheduler
Browse files Browse the repository at this point in the history
  • Loading branch information
dale-wahl committed Oct 8, 2024
2 parents 5ecfbd2 + db5b649 commit f0536e2
Show file tree
Hide file tree
Showing 224 changed files with 32,722 additions and 25,236 deletions.
2 changes: 1 addition & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ data/
.github/
.ipynb_checkpoints/
.gitignore
.idea/
.idea/
3 changes: 1 addition & 2 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ TELEGRAM_PORT=443
# Docker Volume Names
DOCKER_DB_VOL=4cat_4cat_db
DOCKER_DATA_VOL=4cat_4cat_data
DOCKER_CONFIG_VOL=4cat_4cat_share
DOCKER_CONFIG_VOL=4cat_4cat_config
DOCKER_LOGS_VOL=4cat_4cat_logs

# Gunicorn settings
Expand All @@ -39,4 +39,3 @@ workers=4
threads=4
worker_class=gthread
log_level=debug

36 changes: 29 additions & 7 deletions .github/workflows/docker_pr_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,34 @@ jobs:
name: Test docker-compose up with build
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Run docker compose up
run: docker compose -f docker-compose_build.yml up -d
- name: Wait and check log
- name: Check backend container is running
run: |
sleep 30
if [ "$(docker ps | grep 4cat_backend)" ]; then
echo "Docker 4cat_backend container is running..."
else
echo -e "Docker 4cat_backend container is not running...\nPrinting 4cat_backend logs:\n\n$(docker container logs 4cat_backend)"
exit 1
fi
- name: Check frontend container is running
run: |
sleep 10
if [ "$(docker ps | grep 4cat_frontend)" ]; then
echo "Docker 4cat_frontend container is running..."
else
echo -e "Docker 4cat_frontend container is not running...\nPrinting 4cat_frontend logs:\n\n$(docker container logs 4cat_frontend)"
exit 1
fi
- name: Check 4CAT backend log for expected INFO message
run: |
test_case=" INFO at api.py:65: Local API listening for requests at backend:4444"
sleep 30 && var=$(docker exec 4cat_backend tail -n 1 logs/backend_4cat.log)
echo "::group::Backend test"
if [ "$(echo "$var" | tr "|" "\n" | sed -n '2p')" = "$test_case" ]; then
echo "Backend running as expected"
echo "4CAT backend running as expected"
else
echo "::error::Backend failed to start"
echo "Test:$test_case"
Expand All @@ -32,7 +50,11 @@ jobs:
- name: Print log on failure
if: failure()
run: |
docker cp 4cat_backend:/usr/src/app/logs/backend_4cat.log ./backend_4cat.log
echo "::group::Backend logs"
cat backend_4cat.log
echo "::endgroup::"
if [ "$(docker ps | grep 4cat)" ]; then
docker cp 4cat_backend:/usr/src/app/logs/backend_4cat.log ./backend_4cat.log
echo "::group::Backend logs"
cat backend_4cat.log
echo "::endgroup::"
else
echo "Docker containers not running; check logs in previous steps"
fi
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ venv/

# generated by 4CAT
webtool/static/css/colours.css
webtool/static/img/favicon/favicon.ico

# data files
results
Expand Down
2 changes: 1 addition & 1 deletion .zenodo.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"license": "MPL-2.0",
"title": "4CAT Capture and Analysis Toolkit",
"upload_type": "software",
"version": "v1.39",
"version": "v1.46",
"keywords": [
"webmining",
"scraping",
Expand Down
7 changes: 4 additions & 3 deletions 4cat-daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
cli = argparse.ArgumentParser()
cli.add_argument("--interactive", "-i", default=False, help="Run 4CAT in interactive mode (not in the background).",
action="store_true")
cli.add_argument("--log-level", "-l", default="INFO", help="Set log level (\"DEBUG\", \"INFO\", \"WARNING\", \"ERROR\", \"CRITICAL\", \"FATAL\").")
cli.add_argument("--no-version-check", "-n", default=False,
help="Skip version check that may prompt the user to migrate first.", action="store_true")
cli.add_argument("command")
Expand Down Expand Up @@ -81,14 +82,14 @@
print("Running backend in interactive mode instead.")
import backend.bootstrap as bootstrap

bootstrap.run(as_daemon=False)
bootstrap.run(as_daemon=False, log_level=args.log_level)
sys.exit(0)

if args.interactive:
print("Running backend in interactive mode.")
import backend.bootstrap as bootstrap

bootstrap.run(as_daemon=False)
bootstrap.run(as_daemon=False, log_level=args.log_level)
sys.exit(0)
else:
# if so, import necessary modules
Expand Down Expand Up @@ -129,7 +130,7 @@ def start():
detach_process=True
) as context:
import backend.bootstrap as bootstrap
bootstrap.run(as_daemon=True)
bootstrap.run(as_daemon=True, log_level=args.log_level)

sys.exit(0)

Expand Down
64 changes: 63 additions & 1 deletion LICENSE-3DPARTY
Original file line number Diff line number Diff line change
Expand Up @@ -802,4 +802,66 @@ Incorporates the Graphology graph manipulation library
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
THE SOFTWARE.

-------------------------------------------------------------------------------
Incorporates the zip.js library
- at /webtool/static/js/zip.min.js
- from https://github.com/gildas-lormeau/zip.js

BSD 3-Clause License

Copyright (c) 2023, Gildas Lormeau

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-------------------------------------------------------------------------------
Incorporates the bsky-embed library
- at /webtool/static/js/bsky-embed.es.js
- from https://github.com/Vincenius/bsky-embed

MIT License

Copyright (c) 2024 Vincent Will (Vincenius)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

41 changes: 25 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
[![Requires Python 3.8](https://img.shields.io/badge/py-v3.8-blue)](https://www.python.org/)
[![Docker image status](https://github.com/digitalmethodsinitiative/4cat/actions/workflows/docker_latest.yml/badge.svg)](https://github.com/digitalmethodsinitiative/4cat/actions/workflows/docker_latest.yml)

<p align="center">4CAT has a website at <a href="https://4cat.nl">4cat.nl</a>.</p>
<p align="center"><img alt="A screenshot of 4CAT, displaying its 'Create Dataset' interface" src="common/assets/screenshot1.png"><img alt="A screenshot of 4CAT, displaying a network visualisation of a dataset" src="common/assets/screenshot2.png"></p>

<p align="center">4CAT has a website at <a href="https://4cat.nl">4cat.nl</a>.</p>
<p align="center"><a href="https://bsky.app/profile/4cat.nl">Follow 4CAT on Bluesky</a> for updates.</p>
4CAT is a research tool that can be used to analyse and process data from
online social platforms. Its goal is to make the capture and analysis of data
from these platforms accessible to people through a web interface, without
Expand All @@ -26,39 +27,47 @@ such as the generation and visualisation of word embedding models.
platforms that are part of the tool, but you can also [add additional data
sources](https://github.com/digitalmethodsinitiative/4cat/wiki/How-to-make-a-data-source)
using 4CAT's Python API. The following data sources are currently supported
actively and can be used to collect data with 4CAT:
actively and can be used to collect data with 4CAT directly:

* 4chan and 8kun
* BitChute
* Reddit
* Telegram
* Tumblr
* Twitter API v2 (Academic and regular tracks)

The following platforms are supported through other tools, with which you can
collect data to import data into 4CAT for analysis:
The following platforms are supported through
[Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer), with
which you can collect data to import into 4CAT for analysis:

* Instagram (posts)
* TikTok (posts and comments)
* 9gag
* Imgur
* LinkedIn
* Gab
* Douyin
* X/Twitter

* Instagram, TikTok, 9gag, Imgur, LinkedIn, Parler, Douyin and Twitter (via
[Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer))
* Facebook and Instagram (via [CrowdTangle](https://www.crowdtangle.com) exports)
It is also possible to upload data collected with other tools as CSV files. The
following tools are explicitly supported but other data can also be uploaded as
long as it is formatted as CSV:

* Facebook and Instagram (via [CrowdTangle](https://www.crowdtangle.com) or [Facepager](https://github.com/strohne/Facepager) exports)
* YouTube videos and comments (via the [YouTube Data Tools](https://ytdt.digitalmethods.net/))
* Weibo (via [Bazhuayu](https://www.bazhuayu.com/))

A number of other platforms have built-in support that is untested, or requires
e.g. special API access. You can view the [data sources in our wiki](https://github.com/digitalmethodsinitiative/4cat/wiki/Available-data-sources) or review [the data
sources' code](https://github.com/digitalmethodsinitiative/4cat/tree/master/datasources)
in the GitHub repository. It is also possible to import your own CSV files into
4CAT for analysis.
in the GitHub repository.

## Installation
You can install 4CAT locally or on a server via Docker or manually. For easiest installation, we reccomend copying our [`docker-compose.yml file`](https://raw.githubusercontent.com/digitalmethodsinitiative/4cat/master/docker-compose.yml), [`.env`](https://raw.githubusercontent.com/digitalmethodsinitiative/4cat/master/.env) file, and running this terminal command in the folder where those files have been saved:
You can install 4CAT locally or on a server, either via Docker or manually. For the easiest installation, we recommend copying our [`docker-compose.yml`](https://raw.githubusercontent.com/digitalmethodsinitiative/4cat/master/docker-compose.yml) and [`.env`](https://raw.githubusercontent.com/digitalmethodsinitiative/4cat/master/.env) files, and running this terminal command in the folder where those files have been saved:

```
docker-compose up -d
```

In-depth instructions for both Docker and manual installation can be found [in our
wiki](https://github.com/digitalmethodsinitiative/4cat/wiki/Installing-4CAT).

A video walkthrough installing 4CAT via Docker can be found on [YouTube here](https://youtu.be/oWsB7bvNfOY).
wiki](https://github.com/digitalmethodsinitiative/4cat/wiki/Installing-4CAT). A video walkthrough installing 4CAT via Docker can be found on [YouTube here](https://youtu.be/oWsB7bvNfOY).

Currently, scraping 4chan, 8chan, and 8kun requires additional steps; please see the wiki.

Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
1.41
1.46

This file should not be modified. It is used by 4CAT to determine whether it
needs to run migration scripts to e.g. update the database structure to a more
Expand Down
6 changes: 0 additions & 6 deletions backend/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
import sys
import os

from common.lib.module_loader import ModuleCollector

# load modules
all_modules = ModuleCollector()

# add 4CAT root as import path
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + "/..")
6 changes: 3 additions & 3 deletions backend/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from common.config_manager import config

def run(as_daemon=True):
def run(as_daemon=True, log_level="INFO"):
pidfile = Path(config.get('PATH_ROOT'), config.get('PATH_LOCKFILE'), "4cat.pid")

if as_daemon:
Expand Down Expand Up @@ -49,9 +49,9 @@ def run(as_daemon=True):
if config.get("USING_DOCKER"):
as_daemon = True
# Rename log if Docker setup
log = Logger(output=True, filename='backend_4cat.log')
log = Logger(output=True, filename='backend_4cat.log', log_level=log_level)
else:
log = Logger(output=not as_daemon)
log = Logger(output=not as_daemon, filename='4cat.log', log_level=log_level)

log.info("4CAT Backend started, logger initialised")
db = Database(logger=log, appname="main",
Expand Down
6 changes: 5 additions & 1 deletion backend/database.sql
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ CREATE TABLE IF NOT EXISTS datasets (
is_private boolean DEFAULT TRUE,
software_version text,
software_file text DEFAULT '',
software_source text DEFAULT '',
annotation_fields text DEFAULT ''
);

Expand All @@ -80,9 +81,12 @@ CREATE TABLE IF NOT EXISTS metrics (
datasource text,
board text,
date text,
count integer
count BIGINT
);

CREATE UNIQUE INDEX IF NOT EXISTS unique_metrics
ON metrics (metric, datasource, board, date);

-- users
CREATE TABLE IF NOT EXISTS users (
name TEXT UNIQUE PRIMARY KEY,
Expand Down
Loading

0 comments on commit f0536e2

Please sign in to comment.