From 1cd0c31d05c3c1fcfc1faf7d7c17e8f7941077e4 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Thu, 6 Jun 2024 16:46:55 -0400 Subject: [PATCH] HGNC ROBOT template - Rename: mondo_genes.csv --> mondo_genes.robot.tsv --- .github/workflows/buid_and_release.yml | 2 +- .gitignore | 2 +- README.md | 17 ++++-- makefile | 14 ++++- run.sh | 85 ++++++++++++++++++++++++++ 5 files changed, 111 insertions(+), 9 deletions(-) create mode 100644 run.sh diff --git a/.github/workflows/buid_and_release.yml b/.github/workflows/buid_and_release.yml index 747c4d1..c2a4638 100644 --- a/.github/workflows/buid_and_release.yml +++ b/.github/workflows/buid_and_release.yml @@ -40,4 +40,4 @@ jobs: files: | omim.owl omim.sssom.tsv - mondo_genes.csv + mondo_genes.robot.tsv diff --git a/.gitignore b/.gitignore index 00a176d..ae2ebb0 100644 --- a/.gitignore +++ b/.gitignore @@ -34,4 +34,4 @@ omim.json mondo_exactmatch_omim.sssom.tsv mondo_exactmatch_omimps.sssom.tsv omim.owl -mondo_genes.csv +mondo_genes.robot.tsv diff --git a/README.md b/README.md index 48f105d..05bbaae 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ OMIM stands for "Online Mendelian Inheritance in Man", and is an online catalog of human genes and genetic disorders. The official site is: https://omim.org/ This purpose of this repository is for data transformations for ingest into Mondo. Mainly, -it is for generating an `omim.ttl` file. +it is for generating an `omim.ttl` and other release artefacts. Disclaimer: This repository and its created data artefacts are unnofficial. For official, up-to-date OMIM data, please visit [omim.org](https://omim.org). @@ -31,10 +31,10 @@ you get an error related to this when installing, ignore it, as it is does not seem to be needed to run any of the tools. If however you do get a `psutil` error when running anything, please let us know by [creating an issue](https://github.com/monarch-initiative/omim/issues/new). 
-## Running & creating `omim.ttl` -Run: `make all` +## Running & creating release +Run: `sh run.sh make all` -Running this will create a new `omim.ttl` file in the root directory. +Running this will create new release artefacts in the root directory. You can also run `make build` or `python -m omim2obo`. These are all the same command. This will download files from omim.org and run the build. @@ -44,8 +44,11 @@ If there's an issue downloading the files, or you are offline, or you just want to use the cache anyway, you can pass the `--use-cache` flag. ## Additional tools +
Details +

+ ### Get PMIDs used for OMIM codes from `omim.ttl` -Command: `make get-pmids` +Command: `sh run.sh make get-pmids` ### OMIM Code Web Scraper Currently, the only feature is `get_codes_by_yyyy_mm`, which returns a list of @@ -86,3 +89,7 @@ from omim2obo.omim_code_scraper import get_codes_by_yyyy_mm code_tuples = get_codes_by_yyyy_mm('2021/05') ``` + + +

+
diff --git a/makefile b/makefile
index 11f3af7..ef8f94b 100644
--- a/makefile
+++ b/makefile
@@ -2,7 +2,7 @@
 
 # MAIN COMMANDS / GOALS ------------------------------------------------------------------------------------------------
 
-all: omim.ttl omim.sssom.tsv omim.owl mondo_genes.csv
+all: omim.ttl omim.sssom.tsv omim.owl mondo_genes.robot.tsv
 
 # build: Create new omim.ttl
 omim.ttl:
@@ -35,8 +35,18 @@ omim.owl: omim.ttl mondo_exactmatch_omim.sssom.owl mondo_exactmatch_omimps.sssom
 		query --update sparql/hgnc_links.ru \
 		convert -f ofn -o $@
 
-mondo_genes.csv: omim.owl
+mondo_genes.robot.tsv: omim.owl
+	# Create a TSV of relational information for gene and disease classes
 	robot query -i omim.owl --query sparql/mondo_genes.sparql $@
+	# Insert the source_code column as the second to last column
+	awk 'BEGIN {FS=OFS="\t"} {if (NR==1) {$$(NF+1)=$$(NF); $$(NF-1)="?source_code";} else {$$(NF+1)=$$(NF); $$(NF-1)="MONDO:OMIM";}} 1' $@ > temp_file && mv temp_file $@
+	# Remove the first character of each field in the header
+	awk 'BEGIN {FS=OFS="\t"} NR==1 {for (i=1; i<=NF; i++) $$i=substr($$i, 2)} {print}' $@ > temp_file && mv temp_file $@
+	# Remove < and > characters from specified columns
+	awk 'BEGIN {FS=OFS="\t"} NR>1 {gsub(/^<|>$$/, "", $$1); gsub(/^<|>$$/, "", $$2); gsub(/^<|>$$/, "", $$5)} {print}' $@ > temp_file && mv temp_file $@
+	# Insert ROBOT subheader
+	robot_subheader="ID\tSC 'has material basis in germline mutation in' some %\t>A oboInOwl:source\t>A oboInOwl:source\t" && \
+	sed 1a"$$robot_subheader" $@ > temp_file && mv temp_file $@
 
 cleanup:
 	@rm -f omim.json
diff --git a/run.sh b/run.sh
new file mode 100644
index 0000000..1f6db72
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,85 @@
+#!/bin/sh
+# Wrapper script for docker.
+#
+# This is used primarily for wrapping the GNU Make workflow.
+# Instead of typing "make TARGET", type "./run.sh make TARGET".
+# This will run the make workflow within a docker container.
+#
+# The current working directory is mounted at /work inside the container,
+# so run this script from the directory that contains the makefile.
+#
+# To use singularity instead of docker, please issue
+#     export USE_SINGULARITY=1    (any non-empty value works)
+# before running this script.
+#
+# See README-editors.md for more details.
+
+if [ -f run.sh.conf ]; then
+    . ./run.sh.conf
+fi
+
+# Look for a GitHub token. NOTE(review): nothing below passes GH_TOKEN to the container - confirm intent.
+if [ -n "$GH_TOKEN" ]; then
+    :
+elif [ -f ../../.github/token.txt ]; then
+    GH_TOKEN=$(cat ../../.github/token.txt)
+elif [ -f "$XDG_CONFIG_HOME/ontology-development-kit/github/token" ]; then
+    GH_TOKEN=$(cat "$XDG_CONFIG_HOME/ontology-development-kit/github/token")
+elif [ -f "$HOME/Library/Application Support/ontology-development-kit/github/token" ]; then
+    GH_TOKEN=$(cat "$HOME/Library/Application Support/ontology-development-kit/github/token")
+fi
+
+ODK_IMAGE=${ODK_IMAGE:-odkfull}
+TAG_IN_IMAGE=$(echo "$ODK_IMAGE" | awk -F':' '{ print $2 }')
+if [ -n "$TAG_IN_IMAGE" ]; then
+    # Override ODK_TAG env var if IMAGE already includes a tag
+    ODK_TAG=$TAG_IN_IMAGE
+    ODK_IMAGE=$(echo "$ODK_IMAGE" | awk -F':' '{ print $1 }')
+fi
+ODK_TAG=${ODK_TAG:-v1.4.3}
+ODK_JAVA_OPTS=${ODK_JAVA_OPTS:--Xmx20G}
+ODK_DEBUG=${ODK_DEBUG:-no}
+
+# Convert OWLAPI_* environment variables to the OWLAPI as Java options
+# See http://owlcs.github.io/owlapi/apidocs_4/org/semanticweb/owlapi/model/parameters/ConfigurationOptions.html
+# for a list of allowed options
+OWLAPI_OPTIONS_NAMESPACE=org.semanticweb.owlapi.model.parameters.ConfigurationOptions
+for owlapi_var in $(env | sed -n 's/^OWLAPI_//p') ; do
+    ODK_JAVA_OPTS="$ODK_JAVA_OPTS -D$OWLAPI_OPTIONS_NAMESPACE.${owlapi_var%=*}=${owlapi_var#*=}"
+done
+
+if [ "$ODK_DEBUG" = yes ]; then
+    echo "Running obolibrary/${ODK_IMAGE}:${ODK_TAG} with ${ODK_JAVA_OPTS} of memory for ROBOT and Java-based pipeline steps."
+    # Prepend GNU time to the command. Doing this through the positional
+    # parameters keeps the format string one argument despite its spaces.
+    set -- /usr/bin/time -f '### DEBUG STATS ###\nElapsed time: %E\nPeak memory: %M kb' "$@"
+fi
+
+VOLUME_BIND=$PWD:/work
+WORK_DIR=/work
+
+if [ -n "$ODK_BINDS" ]; then
+    VOLUME_BIND="$VOLUME_BIND,$ODK_BINDS"
+fi
+
+if [ -n "$USE_SINGULARITY" ]; then
+    singularity exec --cleanenv $ODK_SINGULARITY_OPTIONS \
+        --env "ROBOT_JAVA_ARGS=$ODK_JAVA_OPTS,JAVA_OPTS=$ODK_JAVA_OPTS" \
+        --bind "$VOLUME_BIND" \
+        -W "$WORK_DIR" \
+        docker://obolibrary/$ODK_IMAGE:$ODK_TAG "$@"
+else
+    # One -v per comma-separated bind; expanded unquoted below so it splits into separate arguments.
+    BIND_OPTIONS="-v $(echo "$VOLUME_BIND" | sed 's/,/ -v /g')"
+    docker run $ODK_DOCKER_OPTIONS $BIND_OPTIONS -w "$WORK_DIR" \
+        -e ROBOT_JAVA_ARGS="$ODK_JAVA_OPTS" -e JAVA_OPTS="$ODK_JAVA_OPTS" \
+        --rm -ti obolibrary/$ODK_IMAGE:$ODK_TAG "$@"
+fi
+status=$?  # exit status of the container command; the reminder below must not mask it
+
+case "$*" in
+*update_repo*|*release*)
+    echo "Please remember to update your ODK image from time to time: https://oboacademy.github.io/obook/howto/odk-update/."
+    ;;
+esac
+exit "$status"