Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: use reference service to provide assembly ID #58

Merged
merged 3 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions etc/example.env
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ GOHAN_API_IMAGE=gohan-api
GOHAN_API_VERSION=latest

GOHAN_API_BUILDER_BASE_IMAGE=golang:1.21-bookworm
GOHAN_API_DEV_BASE_IMAGE=ghcr.io/bento-platform/bento_base_image:golang-debian-2023.11.10
GOHAN_API_PROD_BASE_IMAGE=ghcr.io/bento-platform/bento_base_image:plain-debian-2023.11.10
GOHAN_API_DEV_BASE_IMAGE=ghcr.io/bento-platform/bento_base_image:golang-debian-2023.12.01
GOHAN_API_PROD_BASE_IMAGE=ghcr.io/bento-platform/bento_base_image:plain-debian-2023.12.01

GOHAN_API_CONTAINER_NAME=gohan-api
GOHAN_API_SERVICE_HOST=0.0.0.0
Expand Down
2 changes: 1 addition & 1 deletion src/api/contexts/contexts.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ type (

// Convenient storage for relevant http context data
QueryParameters struct {
AssemblyId constants.AssemblyId
AssemblyId string
Alleles []string
Chromosome string
Genotype constants.GenotypeQuery
Expand Down
10 changes: 4 additions & 6 deletions src/api/middleware/assemblyMiddleware.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ package middleware

import (
"gohan/api/contexts"
"gohan/api/models/constants"
assid "gohan/api/models/constants/assembly-id"
"net/http"

"github.com/labstack/echo"
Expand All @@ -16,14 +14,14 @@ func MandateAssemblyIdAttribute(next echo.HandlerFunc) echo.HandlerFunc {
return func(c echo.Context) error {
// check for assemblyId query parameter
assemblyId := c.QueryParam("assemblyId")
if len(assemblyId) == 0 || !assid.IsKnownAssemblyId(assemblyId) {
// if no id was provided, or it was invalid, return an error
return echo.NewHTTPError(http.StatusBadRequest, "Missing or unknown assemblyId!")
if len(assemblyId) == 0 {
// if no id was provided, return an error
return echo.NewHTTPError(http.StatusBadRequest, "Missing assemblyId!")
}

// forward the assemblyId value down the pipeline
gc := c.(*contexts.GohanContext)
gc.AssemblyId = constants.AssemblyId(assemblyId)
gc.AssemblyId = assemblyId

return next(gc)
}
Expand Down
40 changes: 2 additions & 38 deletions src/api/models/constants/assembly-id/main.go
Original file line number Diff line number Diff line change
@@ -1,42 +1,6 @@
package assemblyId

import (
"gohan/api/models/constants"
"strings"
)

const (
Unknown constants.AssemblyId = "Unknown"

GRCh38 constants.AssemblyId = "GRCh38"
GRCh37 constants.AssemblyId = "GRCh37"
NCBI36 constants.AssemblyId = "NCBI36"
NCBI35 constants.AssemblyId = "NCBI35"
NCBI34 constants.AssemblyId = "NCBI34"
Other constants.AssemblyId = "Other"
GRCh38 string = "GRCh38"
GRCh37 string = "GRCh37"
)

func CastToAssemblyId(text string) constants.AssemblyId {
switch strings.ToLower(text) {
case "grch38":
return GRCh38
case "grch37":
return GRCh37
case "ncbi36":
return NCBI36
case "ncbi35":
return NCBI35
case "ncbi34":
return NCBI34
case "other":
return Other
default:
return Unknown
}
}

func IsKnownAssemblyId(text string) bool {
// attempt to cast the text to an assemblyId and
// report whether the result is a known (non-Unknown) assemblyId
return CastToAssemblyId(text) != Unknown
}
17 changes: 8 additions & 9 deletions src/api/models/dtos/main.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package dtos

import (
"gohan/api/models/constants"
"gohan/api/models/indexes"
"time"
)
Expand All @@ -21,11 +20,11 @@ type VariantCountReponse struct {
}

type VariantResult struct {
Query string `json:"query,omitempty"`
AssemblyId constants.AssemblyId `json:"assembly_id"`
Chromosome string `json:"chromosome"`
Start int `json:"start"`
End int `json:"end"`
Query string `json:"query,omitempty"`
AssemblyId string `json:"assembly_id"`
Chromosome string `json:"chromosome"`
Start int `json:"start"`
End int `json:"end"`
}

type VariantGetResult struct {
Expand Down Expand Up @@ -54,9 +53,9 @@ type VariantCall struct {
Alleles []string `json:"alleles,omitempty"`
// TODO: GenotypeProbability, PhredScaleLikelihood ?

AssemblyId constants.AssemblyId `json:"assemblyId,omitempty"`
Dataset string `json:"dataset,omitempty"`
DocumentId string `json:"documentId,omitempty"`
AssemblyId string `json:"assemblyId,omitempty"`
Dataset string `json:"dataset,omitempty"`
DocumentId string `json:"documentId,omitempty"`
}

// --- Dataset
Expand Down
18 changes: 9 additions & 9 deletions src/api/models/indexes/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ type Variant struct {

Sample Sample `json:"sample"`

FileId string `json:"fileId"`
Dataset string `json:"dataset"`
AssemblyId c.AssemblyId `json:"assemblyId"`
CreatedTime time.Time `json:"createdTime"`
FileId string `json:"fileId"`
Dataset string `json:"dataset"`
AssemblyId string `json:"assemblyId"`
CreatedTime time.Time `json:"createdTime"`
}

type Info struct {
Expand Down Expand Up @@ -51,9 +51,9 @@ type Genotype struct {
}

type Gene struct {
Name string `json:"name"`
Chrom string `json:"chrom"`
Start int `json:"start"`
End int `json:"end"`
AssemblyId c.AssemblyId `json:"assemblyId"`
Name string `json:"name"`
Chrom string `json:"chrom"`
Start int `json:"start"`
End int `json:"end"`
AssemblyId string `json:"assemblyId"`
}
33 changes: 14 additions & 19 deletions src/api/mvc/genes/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"crypto/tls"
"fmt"
"gohan/api/contexts"
"gohan/api/models/constants"
assemblyId "gohan/api/models/constants/assembly-id"
"gohan/api/models/constants/chromosome"
"gohan/api/models/dtos"
Expand Down Expand Up @@ -51,15 +50,15 @@ func GenesIngest(c echo.Context) error {
http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
}

assemblyIdMap := map[constants.AssemblyId]string{
assemblyIdMap := map[string]string{
assemblyId.GRCh38: "gencode.v38.annotation.gtf",
assemblyId.GRCh37: "gencode.v19.annotation.gtf",
// SKIP
// assemblyId.NCBI36: "hg18",
// assemblyId.NCBI35: "hg17",
// assemblyId.NCBI34: "hg16",
}
assemblyIdGTFUrlMap := map[constants.AssemblyId]string{
assemblyIdGTFUrlMap := map[string]string{
assemblyId.GRCh38: "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gtf.gz",
assemblyId.GRCh37: "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz",
// SKIP
Expand All @@ -79,7 +78,7 @@ func GenesIngest(c echo.Context) error {
CreatedAt: fmt.Sprintf("%v", time.Now()),
}

go func(_assId constants.AssemblyId, _fileName string, _assemblyWg *sync.WaitGroup, reqStat *ingest.GeneIngestRequest) {
go func(_asmId string, _fileName string, _assemblyWg *sync.WaitGroup, reqStat *ingest.GeneIngestRequest) {
defer _assemblyWg.Done()

var (
Expand All @@ -89,7 +88,7 @@ func GenesIngest(c echo.Context) error {
gtfFile, err := os.Open(fmt.Sprintf("%s/%s", gtfPath, _fileName))
if err != nil {
// Download the file
fullURLFile := assemblyIdGTFUrlMap[_assId]
fullURLFile := assemblyIdGTFUrlMap[_asmId]

handleHardErr := func(err error) {
msg := "Something went wrong: " + err.Error()
Expand Down Expand Up @@ -193,13 +192,13 @@ func GenesIngest(c echo.Context) error {
defer gtfFile.Close()

// clean out genes currently in elasticsearch by assembly id
fmt.Printf("Cleaning out %s gene documents from genes index (if any)\n", string(_assId))
esRepo.DeleteGenesByAssemblyId(cfg, es7Client, _assId)
fmt.Printf("Cleaning out %s gene documents from genes index (if any)\n", string(_asmId))
esRepo.DeleteGenesByAssemblyId(cfg, es7Client, _asmId)

fileScanner := bufio.NewScanner(gtfFile)
fileScanner.Split(bufio.ScanLines)

fmt.Printf("Ingesting %s\n", string(_assId))
fmt.Printf("Ingesting %s\n", string(_asmId))
reqStat.State = ingest.Running
iz.GeneIngestRequestChan <- reqStat

Expand All @@ -222,7 +221,7 @@ func GenesIngest(c echo.Context) error {
go func(rowText string, _chromHeaderKey int,
_startKey int, _endKey int,
_nameHeaderKeys []int, _geneNameHeaderKeys []int,
_assId constants.AssemblyId,
_assId string,
_gwg *sync.WaitGroup) {
// fmt.Printf("row : %s\n", row)

Expand Down Expand Up @@ -276,19 +275,19 @@ func GenesIngest(c echo.Context) error {
Chrom: chromosomeClean,
Start: start,
End: end,
AssemblyId: _assId,
AssemblyId: _asmId,
}

iz.GeneIngestionBulkIndexingQueue <- &structs.GeneIngestionQueueStructure{
Gene: discoveredGene,
WaitGroup: _gwg,
}
}(rowText, chromHeaderKey, startKey, endKey, nameHeaderKeys, geneNameHeaderKeys, _assId, &geneWg)
}(rowText, chromHeaderKey, startKey, endKey, nameHeaderKeys, geneNameHeaderKeys, _asmId, &geneWg)
}

geneWg.Wait()

fmt.Printf("%s ingestion done!\n", _assId)
fmt.Printf("%s ingestion done!\n", _asmId)
fmt.Printf("Deleting %s\n", unzippedFileName)
err = os.Remove(fmt.Sprintf("%s/%s", gtfPath, unzippedFileName))
if err != nil {
Expand Down Expand Up @@ -335,11 +334,7 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error {
// Assembly ID
// passed through as provided; an empty value makes the gene
// repository fall back to a wildcard ("*") assembly ID search term
var assId constants.AssemblyId
if gc.AssemblyId != assemblyId.Unknown {
// retrieve passed parameter if is valid
assId = gc.AssemblyId
}
asmId := gc.AssemblyId

// Size
var (
Expand All @@ -354,10 +349,10 @@ func GenesGetByNomenclatureWildcard(c echo.Context) error {
}
}

fmt.Printf("Executing wildcard genes search for term %s, assemblyId %s (max size: %d)\n", term, assId, size)
fmt.Printf("Executing wildcard genes search for term %s, assemblyId %s (max size: %d)\n", term, asmId, size)

// Execute
docs, geneErr := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, chromosomeSearchTerm, term, assId, size)
docs, geneErr := esRepo.GetGeneDocumentsByTermWildcard(cfg, es, chromosomeSearchTerm, term, asmId, size)
if geneErr != nil {
return c.JSON(http.StatusOK, map[string]interface{}{
"status": 500,
Expand Down
9 changes: 2 additions & 7 deletions src/api/mvc/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package mvc
import (
"gohan/api/contexts"
"gohan/api/models/constants"
a "gohan/api/models/constants/assembly-id"
gq "gohan/api/models/constants/genotype-query"
"strings"

Expand All @@ -12,7 +11,7 @@ import (
"github.com/labstack/echo"
)

func RetrieveCommonElements(c echo.Context) (*elasticsearch.Client, string, int, int, string, string, []string, constants.GenotypeQuery, constants.AssemblyId, string) {
func RetrieveCommonElements(c echo.Context) (*elasticsearch.Client, string, int, int, string, string, []string, constants.GenotypeQuery, string, string) {
gc := c.(*contexts.GohanContext)
es := gc.Es7Client

Expand Down Expand Up @@ -48,11 +47,7 @@ func RetrieveCommonElements(c echo.Context) (*elasticsearch.Client, string, int,
}
}

assemblyId := a.Unknown
assemblyIdQP := c.QueryParam("assemblyId")
if len(assemblyIdQP) > 0 && a.IsKnownAssemblyId(assemblyIdQP) {
assemblyId = a.CastToAssemblyId(assemblyIdQP)
}
assemblyId := c.QueryParam("assemblyId")

return es, chromosome, lowerBound, upperBound, reference, alternative, alleles, genotype, assemblyId, datasetString
}
14 changes: 6 additions & 8 deletions src/api/repositories/elasticsearch/genes.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ import (
"time"

"gohan/api/models"
"gohan/api/models/constants"
assemblyId "gohan/api/models/constants/assembly-id"
"gohan/api/utils"

"github.com/elastic/go-elasticsearch/v7"
Expand Down Expand Up @@ -106,7 +104,7 @@ func GetGeneBucketsByKeyword(cfg *models.Config, es *elasticsearch.Client) (map[
}

func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client,
chromosomeSearchTerm string, term string, assId constants.AssemblyId, size int) (map[string]interface{}, error) {
chromosomeSearchTerm string, term string, asmId string, size int) (map[string]interface{}, error) {

if cfg.Debug {
http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
Expand All @@ -115,10 +113,10 @@ func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client
// Nomenclature Search Term
nomenclatureStringTerm := fmt.Sprintf("*%s*", term)

// Assembly Id Search Term (wildcard by default)
// Assembly ID Search Term (wildcard by default)
assemblyIdStringTerm := "*"
if assId != assemblyId.Unknown {
assemblyIdStringTerm = string(assId)
if asmId != "" {
assemblyIdStringTerm = asmId
}

var buf bytes.Buffer
Expand Down Expand Up @@ -217,7 +215,7 @@ func GetGeneDocumentsByTermWildcard(cfg *models.Config, es *elasticsearch.Client
return result, nil
}

func DeleteGenesByAssemblyId(cfg *models.Config, es *elasticsearch.Client, assId constants.AssemblyId) (map[string]interface{}, error) {
func DeleteGenesByAssemblyId(cfg *models.Config, es *elasticsearch.Client, asmId string) (map[string]interface{}, error) {

if cfg.Debug {
http.DefaultTransport.(*http.Transport).TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
Expand All @@ -227,7 +225,7 @@ func DeleteGenesByAssemblyId(cfg *models.Config, es *elasticsearch.Client, assId
query := map[string]interface{}{
"query": map[string]interface{}{
"match": map[string]interface{}{
"assemblyId": string(assId),
"assemblyId": asmId,
},
},
}
Expand Down
7 changes: 3 additions & 4 deletions src/api/repositories/elasticsearch/variants.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import (

"gohan/api/models"
c "gohan/api/models/constants"
a "gohan/api/models/constants/assembly-id"
gq "gohan/api/models/constants/genotype-query"
s "gohan/api/models/constants/sort"
z "gohan/api/models/constants/zygosity"
Expand Down Expand Up @@ -110,7 +109,7 @@ func GetDocumentsContainerVariantOrSampleIdInPositionRange(cfg *models.Config, e
reference string, alternative string, alleles []string,
size int, sortByPosition c.SortDirection,
includeInfoInResultSet bool,
genotype c.GenotypeQuery, assemblyId c.AssemblyId,
genotype c.GenotypeQuery, assemblyId string,
getSampleIdsOnly bool) (map[string]interface{}, error) {

// begin building the request body.
Expand Down Expand Up @@ -405,7 +404,7 @@ func CountDocumentsContainerVariantOrSampleIdInPositionRange(cfg *models.Config,
chromosome string, lowerBound int, upperBound int,
variantId string, sampleId string, datasetString string,
reference string, alternative string, alleles []string,
genotype c.GenotypeQuery, assemblyId c.AssemblyId) (map[string]interface{}, error) {
genotype c.GenotypeQuery, assemblyId string) (map[string]interface{}, error) {

// begin building the request body.
mustMap := []map[string]interface{}{{
Expand Down Expand Up @@ -465,7 +464,7 @@ func CountDocumentsContainerVariantOrSampleIdInPositionRange(cfg *models.Config,
}})
}

if assemblyId != "" && assemblyId != a.Unknown {
if assemblyId != "" {
mustMap = append(mustMap, map[string]interface{}{
"match": map[string]interface{}{
"assemblyId": map[string]interface{}{
Expand Down
Loading