diff --git a/cmd/commands/datasetIngestor.go b/cmd/commands/datasetIngestor.go index e6850be..5851826 100644 --- a/cmd/commands/datasetIngestor.go +++ b/cmd/commands/datasetIngestor.go @@ -105,30 +105,28 @@ For Windows you need instead to specify -user username:password on the command l } metadatafile := args[0] - filelistingPath := "" - folderlistingPath := "" + datasetFileListTxt := "" + folderListingTxt := "" absFileListing := "" if len(args) == 2 { - if args[1] == "folderlisting.txt" { - folderlistingPath = args[1] + argFileName := filepath.Base(args[1]) + if argFileName == "folderlisting.txt" { + // NOTE folderListingTxt is a TEXT FILE that lists dataset folders that should all be ingested together + // WITH the same metadata EXCEPT for the sourceFolder path (which is set during ingestion) + folderListingTxt = args[1] } else { - // NOTE filelistingPath is some kind of path to which the sourceFolder path should be relative - filelistingPath = args[1] - absFileListing, _ = filepath.Abs(filelistingPath) + // NOTE datasetFileListTxt is a TEXT FILE that lists the files & folders of a dataset (contained in a folder) + // that should be considered as "part of" the dataset. The paths must be relative to the sourceFolder. + datasetFileListTxt = args[1] + absFileListing, _ = filepath.Abs(datasetFileListTxt) } } if datasetUtils.TestArgs != nil { - datasetUtils.TestArgs([]interface{}{metadatafile, filelistingPath, folderlistingPath}) + datasetUtils.TestArgs([]interface{}{metadatafile, datasetFileListTxt, folderListingTxt}) return } - // functions use this flag in a way where "nil -> unset" - var allowExistingSourceFolderPtr *bool = &allowExistingSourceFolder - if !noninteractiveFlag && !cmd.Flags().Lookup("allowexistingsource").Changed { - allowExistingSourceFolderPtr = nil - } - if showVersion { fmt.Printf("%s\n", VERSION) return @@ -170,48 +168,70 @@ For Windows you need instead to specify -user username:password on the command l /* TODO Add info about policy settings and that autoarchive will take place or not */ - metaDataMap, metaSourceFolder, beamlineAccount, err := datasetIngestor.CheckMetadata(client, APIServer, metadatafile, user, accessGroups) + metaDataMap, metadataSourceFolder, beamlineAccount, err := datasetIngestor.CheckMetadata(client, APIServer, metadatafile, user, accessGroups) if err != nil { log.Fatal("Error in CheckMetadata function: ", err) } //log.Printf("metadata object: %v\n", metaDataMap) - // assemble list of datasetFolders (=datasets) to be created - var datasetFolders []string - if folderlistingPath == "" { - datasetFolders = append(datasetFolders, metaSourceFolder) + // assemble list of datasetPaths (=datasets) to be created + var datasetPaths []string + if folderListingTxt == "" { + datasetPaths = append(datasetPaths, metadataSourceFolder) } else { // get folders from file - folderlist, err := os.ReadFile(folderlistingPath) + folderlist, err := os.ReadFile(folderListingTxt) if err != nil { log.Fatal(err) } lines := strings.Split(string(folderlist), "\n") // remove all empty and comment lines for _, line := range lines { - if line != "" && string(line[0]) != "#" { - // NOTE what is this special third level "data" folder that needs to be unsymlinked? - // convert into canonical form only for certain online data linked from eaccounts home directories - var parts = strings.Split(line, "/") - if len(parts) > 3 && parts[3] == "data" { - realSourceFolder, err := filepath.EvalSymlinks(line) - if err != nil { - log.Fatalf("Failed to find canonical form of sourceFolder:%v %v", line, err) - } - color.Set(color.FgYellow) - log.Printf("Transform sourceFolder %v to canonical form: %v", line, realSourceFolder) - color.Unset() - datasetFolders = append(datasetFolders, realSourceFolder) - } else { - datasetFolders = append(datasetFolders, line) + if line == "" || string(line[0]) == "#" { + continue + } + // NOTE what is this special third level "data" folder that needs to be unsymlinked? + // convert into canonical form only for certain online data linked from eaccounts home directories + var parts = strings.Split(line, "/") + if len(parts) > 3 && parts[3] == "data" { + realSourceFolder, err := filepath.EvalSymlinks(line) + if err != nil { + log.Fatalf("Failed to find canonical form of sourceFolder:%v %v\n", line, err) } + color.Set(color.FgYellow) + log.Printf("Transform sourceFolder %v to canonical form: %v\n", line, realSourceFolder) + color.Unset() + datasetPaths = append(datasetPaths, realSourceFolder) + } else { + datasetPaths = append(datasetPaths, line) } } } // log.Printf("Selected folders: %v\n", folders) // test if a sourceFolder already used in the past and give warning - datasetIngestor.TestForExistingSourceFolder(datasetFolders, client, APIServer, user["accessToken"], allowExistingSourceFolderPtr) + foundList, err := datasetIngestor.TestForExistingSourceFolder(datasetPaths, client, APIServer, user["accessToken"]) + if err != nil { + log.Fatal(err) + } + color.Set(color.FgYellow) + fmt.Println("Warning! The following datasets have been found with the same sourceFolder: ") + for _, element := range foundList { + fmt.Printf(" - PID: \"%s\", sourceFolder: \"%s\"\n", element.Pid, element.SourceFolder) + } + color.Unset() + if !allowExistingSourceFolder && len(foundList) > 0 { + if cmd.Flags().Changed("allowexistingsource") { + log.Printf("Do you want to ingest the corresponding new datasets nevertheless (y/N) ? ") + scanner.Scan() + archiveAgain := scanner.Text() + if archiveAgain != "y" { + log.Fatalln("Aborted.") + } + } else { + log.Fatalln("Existing sourceFolders are not allowed. Aborted.") + } + } // TODO ask archive system if sourcefolder is known to them. If yes no copy needed, otherwise // a destination location is defined by the archive system @@ -220,7 +240,7 @@ For Windows you need instead to specify -user username:password on the command l // now everything is prepared, start to loop over all folders var skip = "" // check if skip flag is globally defined via flags: - if cmd.Flags().Lookup("linkfiles").Changed { + if cmd.Flags().Changed("linkfiles") { switch linkfiles { case "delete": skip = "sA" @@ -232,17 +252,20 @@ For Windows you need instead to specify -user username:password on the command l } var datasetList []string - for _, sourceFolder := range datasetFolders { + for _, datasetSourceFolder := range datasetPaths { // ignore empty lines - if sourceFolder == "" { + if datasetSourceFolder == "" { // NOTE if there are empty source folder(s), shouldn't we raise an error? continue } - metaDataMap["sourceFolder"] = sourceFolder - log.Printf("Scanning files in dataset %s", sourceFolder) + metaDataMap["sourceFolder"] = datasetSourceFolder + log.Printf("Scanning files in dataset %s", datasetSourceFolder) - fullFileArray, startTime, endTime, owner, numFiles, totalSize := - datasetIngestor.AssembleFilelisting(sourceFolder, filelistingPath, &skip) + fullFileArray, startTime, endTime, owner, numFiles, totalSize, err := + datasetIngestor.GetLocalFileList(datasetSourceFolder, datasetFileListTxt, &skip) + if err != nil { + log.Fatalf("Can't gather the filelist of \"%s\"", datasetSourceFolder) + } //log.Printf("full fileListing: %v\n Start and end time: %s %s\n ", fullFileArray, startTime, endTime) log.Printf("The dataset contains %v files with a total size of %v bytes.", numFiles, totalSize) @@ -268,10 +291,10 @@ For Windows you need instead to specify -user username:password on the command l // and unless copy flag defined via command line if !copyFlag && !nocopyFlag { // NOTE this whole copyFlag, nocopyFlag ordeal makes no sense whatsoever if !beamlineAccount { - err := datasetIngestor.CheckDataCentrallyAvailable(user["username"], RSYNCServer, sourceFolder) + err := datasetIngestor.CheckDataCentrallyAvailable(user["username"], RSYNCServer, datasetSourceFolder) if err != nil { color.Set(color.FgYellow) - log.Printf("The source folder %v is not centrally available (decentral use case).\nThe data must first be copied to a rsync cache server.\n ", sourceFolder) + log.Printf("The source folder %v is not centrally available (decentral use case).\nThe data must first be copied to a rsync cache server.\n ", datasetSourceFolder) color.Unset() copyFlag = true // check if user account @@ -323,7 +346,7 @@ For Windows you need instead to specify -user username:password on the command l datasetIngestor.AddAttachment(client, APIServer, datasetId, metaDataMap, user["accessToken"], addAttachment, addCaption) } if copyFlag { - err := datasetIngestor.SyncDataToFileserver(datasetId, user, RSYNCServer, sourceFolder, absFileListing) + err := datasetIngestor.SyncDataToFileserver(datasetId, user, RSYNCServer, datasetSourceFolder, absFileListing) if err == nil { // delayed enabling archivable = true @@ -402,4 +425,5 @@ func init() { datasetIngestorCmd.Flags().String("addcaption", "", "Optional caption to be stored with attachment (single dataset case only)") datasetIngestorCmd.MarkFlagsMutuallyExclusive("testenv", "devenv", "localenv", "tunnelenv") + //datasetIngestorCmd.MarkFlagsMutuallyExclusive("nocopy", "copy") } diff --git a/datasetIngestor/assembleFilelisting.go b/datasetIngestor/assembleFilelisting.go index 3fc89e5..74c8238 100644 --- a/datasetIngestor/assembleFilelisting.go +++ b/datasetIngestor/assembleFilelisting.go @@ -26,7 +26,9 @@ type Datafile struct { var skippedLinks = 0 var illegalFileNames = 0 var errorGroupIds = 0 + const windows = "windows" + var scanner = bufio.NewScanner(os.Stdin) // readLines reads a whole file into memory @@ -47,7 +49,7 @@ func readLines(path string) ([]string, error) { } /* -AssembleFilelisting scans a source folder and optionally a file listing, and returns a list of data files, the earliest and latest modification times, the owner, the number of files, and the total size of the files. +GetLocalFileList scans a source folder and optionally a file listing, and returns a list of data files, the earliest and latest modification times, the owner, the number of files, and the total size of the files. Parameters: - sourceFolder: The path to the source folder to scan. @@ -68,18 +70,17 @@ Returns: The function logs an error and returns if it cannot change the working directory to the source folder. */ -func AssembleFilelisting(sourceFolder string, filelistingPath string, skip *string) (fullFileArray []Datafile, startTime time.Time, endTime time.Time, owner string, numFiles int64, totalSize int64) { +func GetLocalFileList(sourceFolder string, filelistingPath string, skip *string) (fullFileArray []Datafile, startTime time.Time, endTime time.Time, owner string, numFiles int64, totalSize int64, err error) { // scan all lines //fmt.Println("sourceFolder,listing:", sourceFolder, filelistingPath) fullFileArray = make([]Datafile, 0) - startTime = time.Date(2100, 1, 1, 12, 0, 0, 0, time.UTC) + startTime = time.Date(2500, 1, 1, 12, 0, 0, 0, time.UTC) endTime = time.Date(2000, 1, 1, 12, 0, 0, 0, time.UTC) owner = "" numFiles = 0 totalSize = 0 var lines []string - var err error if filelistingPath == "" { log.Printf("No explicit filelistingPath defined - full folder %s is used.\n", sourceFolder) @@ -97,24 +98,29 @@ func AssembleFilelisting(sourceFolder string, filelistingPath string, skip *stri // TODO verify that filelisting have no overlap, e.g. no lines X/ and X/Y, // because the latter is already contained in X/ - // restore cwd after function - cwd, _ := os.Getwd() - defer os.Chdir(cwd) + // restore oldWorkDir after function + oldWorkDir, err := os.Getwd() + if err != nil { + return fullFileArray, startTime, endTime, owner, numFiles, totalSize, err + } + + defer os.Chdir(oldWorkDir) // for windows source path add colon in the leading drive character // windowsSource := strings.Replace(sourceFolder, "/C/", "C:/", 1) - osSource := sourceFolder if runtime.GOOS == windows { re := regexp.MustCompile(`^\/([A-Z])\/`) - osSource = re.ReplaceAllString(sourceFolder, "$1:/") + sourceFolder = re.ReplaceAllString(sourceFolder, "$1:/") } - if err := os.Chdir(osSource); err != nil { + if err := os.Chdir(sourceFolder); err != nil { log.Printf("Can not step into sourceFolder %v - dataset will be ignored.\n", sourceFolder) - return fullFileArray, startTime, endTime, owner, numFiles, totalSize - } else { - dir, _ := os.Getwd() - log.Printf("Scanning source folder: %s at %s", sourceFolder, dir) + return fullFileArray, startTime, endTime, owner, numFiles, totalSize, err + } + dir, err := os.Getwd() + if err != nil { + return fullFileArray, startTime, endTime, owner, numFiles, totalSize, err } + log.Printf("Scanning source folder: %s at %s", sourceFolder, dir) // reinitaialize *skip variable unless valid for All Datasets @@ -126,15 +132,9 @@ func AssembleFilelisting(sourceFolder string, filelistingPath string, skip *stri // spin.Color("green") for _, line := range lines { - // log.Printf("Inside lines loop:%s\n", line) if len(line) == 0 { continue } - // do not printout the default "./" case - // if len(line) > 2 { - // log.Printf("Line: %s %d\n", line, len(line)) - //} - // if folder make recursive ls // spin.Start() // Start the spinner e := filepath.Walk(line, func(path string, f os.FileInfo, err error) error { @@ -146,41 +146,49 @@ func AssembleFilelisting(sourceFolder string, filelistingPath string, skip *stri if f.IsDir() && f.Name() == "." { return nil } + // extract OS dependent owner IDs and translate to names - if err == nil { - uidName, gidName := GetFileOwner(f) - // replace backslashes for windows path - modpath := path - if runtime.GOOS == windows { - modpath = strings.Replace(path, "\\", "/", -1) - } - fileStruct := Datafile{Path: modpath, User: uidName, Group: gidName, Perm: f.Mode().String(), Size: f.Size(), Time: f.ModTime().Format(time.RFC3339)} - keep := true - if f.Mode()&os.ModeSymlink == os.ModeSymlink { - pointee, _ := os.Readlink(modpath) // just pass the file name - if !filepath.IsAbs(pointee) { - dir, err := os.Getwd() - // log.Printf(" CWD path pointee :%v %v %v", dir, filepath.Dir(path), pointee) - pabs := filepath.Join(dir, filepath.Dir(modpath), pointee) - pointee, err = filepath.EvalSymlinks(pabs) - if err != nil { - log.Printf("Could not follow symlink for file:%v %v", pabs, err) - keep = false - log.Printf("keep variable set to %v", keep) - } + if err != nil { + // stop function if err given by Walk is not nil + return err + } + uidName, gidName := GetFileOwner(f) + // replace backslashes for windows path + modpath := path + if runtime.GOOS == windows { + modpath = strings.Replace(path, "\\", "/", -1) + } + fileStruct := Datafile{Path: modpath, User: uidName, Group: gidName, Perm: f.Mode().String(), Size: f.Size(), Time: f.ModTime().Format(time.RFC3339)} + keep := true + + // * handle symlinks * + if f.Mode()&os.ModeSymlink != 0 { + pointee, _ := os.Readlink(modpath) // just pass the file name + if !filepath.IsAbs(pointee) { + dir, err := os.Getwd() + if err != nil { + return err } - //fmt.Printf("Skip variable:%v\n", *skip) - if *skip == "ka" || *skip == "kA" { - keep = true - } else if *skip == "sa" || *skip == "sA" { + pabs := filepath.Join(dir, filepath.Dir(modpath), pointee) + pointee, err = filepath.EvalSymlinks(pabs) + if err != nil { + log.Printf("Could not follow symlink for file:%v %v", pabs, err) keep = false - } else if *skip == "da" || *skip == "dA" { - keep = strings.HasPrefix(pointee, sourceFolder) - } else { - color.Set(color.FgYellow) - log.Printf("Warning: the file %s is a link pointing to %v.", modpath, pointee) - color.Unset() - log.Printf(` + log.Printf("keep variable set to %v", keep) + } + } + //fmt.Printf("Skip variable:%v\n", *skip) + if *skip == "ka" || *skip == "kA" { + keep = true + } else if *skip == "sa" || *skip == "sA" { + keep = false + } else if *skip == "da" || *skip == "dA" { + keep = strings.HasPrefix(pointee, sourceFolder) + } else { + color.Set(color.FgYellow) + log.Printf("Warning: the file %s is a link pointing to %v.", modpath, pointee) + color.Unset() + log.Printf(` Please test if this link is meaningful and not pointing outside the sourceFolder %s. The default behaviour is to keep only internal links within a source folder. @@ -189,70 +197,67 @@ subsequent links within the current dataset, by appending an a (dA,ka,sa). If you want to give the same answer even to all subsequent datasets in this command then specify a capital 'A', e.g. (dA,kA,sA) Do you want to keep the link in dataset or skip it (D(efault)/k(eep)/s(kip) ?`, sourceFolder) - scanner.Scan() - *skip = scanner.Text() - if *skip == "" { - *skip = "d" - } - if *skip == "d" || *skip == "dA" { - keep = strings.HasPrefix(pointee, sourceFolder) - } else { - keep = (*skip != "s" && *skip != "sa" && *skip != "sA") - } + scanner.Scan() + *skip = scanner.Text() + if *skip == "" { + *skip = "d" } - if keep { - color.Set(color.FgGreen) - log.Printf("You chose to keep the link %v -> %v.\n\n", modpath, pointee) + if *skip == "d" || *skip == "dA" { + keep = strings.HasPrefix(pointee, sourceFolder) } else { - color.Set(color.FgRed) - skippedLinks++ - log.Printf("You chose to remove the link %v -> %v.\n\n", modpath, pointee) + keep = (*skip != "s" && *skip != "sa" && *skip != "sA") } - color.Unset() } - - // make sure that filenames do not contain characters like "\" or "*" - if strings.ContainsAny(modpath, "*\\") { + if keep { + color.Set(color.FgGreen) + log.Printf("You chose to keep the link %v -> %v.\n\n", modpath, pointee) + } else { color.Set(color.FgRed) - log.Printf("Warning: the file %s contains illegal characters like *,\\ and will not be archived.", modpath) - color.Unset() - illegalFileNames++ - keep = false + skippedLinks++ + log.Printf("You chose to remove the link %v -> %v.\n\n", modpath, pointee) } - // and check for triple blanks, they are used to separate columns in messages - if keep && strings.Contains(modpath, " ") { - color.Set(color.FgRed) - log.Printf("Warning: the file %s contains 3 consecutive blanks which is not allowed. The file not be archived.", modpath) - color.Unset() - illegalFileNames++ - keep = false + color.Unset() + } + + // * filter invalid filenames * + // make sure that filenames do not contain characters like "\" or "*" + if strings.ContainsAny(modpath, "*\\") { + color.Set(color.FgRed) + log.Printf("Warning: the file %s contains illegal characters like *,\\ and will not be archived.", modpath) + color.Unset() + illegalFileNames++ + keep = false + } + // and check for triple blanks, they are used to separate columns in messages + if keep && strings.Contains(modpath, " ") { + color.Set(color.FgRed) + log.Printf("Warning: the file %s contains 3 consecutive blanks which is not allowed. The file not be archived.", modpath) + color.Unset() + illegalFileNames++ + keep = false + } + if keep { + numFiles++ + totalSize += f.Size() + //fmt.Println(numFiles, totalSize) + //fullFileArray = append(fullFileArray, fileline) + fullFileArray = append(fullFileArray, fileStruct) + // find out earlist creation time + modTime := f.ModTime() + //fmt.Printf("FileTime:", modTime) + diff := modTime.Sub(startTime) + if diff < (time.Duration(0) * time.Second) { + startTime = modTime + // fmt.Printf("Earliest Time:%v\n", startTime) } - if keep { - numFiles++ - totalSize += f.Size() - //fmt.Println(numFiles, totalSize) - //fullFileArray = append(fullFileArray, fileline) - fullFileArray = append(fullFileArray, fileStruct) - // find out earlist creation time - modTime := f.ModTime() - //fmt.Printf("FileTime:", modTime) - diff := modTime.Sub(startTime) - if diff < (time.Duration(0) * time.Second) { - startTime = modTime - // fmt.Printf("Earliest Time:%v\n", startTime) - } - diff = modTime.Sub(endTime) - if diff > (time.Duration(0) * time.Second) { - endTime = modTime - //fmt.Printf("Last Time:%v\n", endTime) - } - owner = gidName + diff = modTime.Sub(endTime) + if diff > (time.Duration(0) * time.Second) { + endTime = modTime + //fmt.Printf("Last Time:%v\n", endTime) } - - //log.Println("Path:",modpath) - } else { - log.Println("Error:", err) + owner = gidName } + return err }) @@ -261,7 +266,7 @@ Do you want to keep the link in dataset or skip it (D(efault)/k(eep)/s(kip) ?`, } } // spin.Stop() - return fullFileArray, startTime, endTime, owner, numFiles, totalSize + return fullFileArray, startTime, endTime, owner, numFiles, totalSize, err } func PrintFileInfos() { diff --git a/datasetIngestor/assembleFilelisting_test.go b/datasetIngestor/assembleFilelisting_test.go index 748493b..e9d2d75 100644 --- a/datasetIngestor/assembleFilelisting_test.go +++ b/datasetIngestor/assembleFilelisting_test.go @@ -14,18 +14,21 @@ func TestAssembleFilelisting(t *testing.T) { t.Fatalf("Failed to create temp directory: %s", err) } defer os.RemoveAll(tempDir) - + // Create a file in the temporary directory fileName := "testfile" filePath := filepath.Join(tempDir, fileName) if err := os.WriteFile(filePath, []byte("test"), 0644); err != nil { t.Fatalf("Failed to create test file: %s", err) } - + // Call AssembleFilelisting on the temporary directory skip := "" - fullFileArray, startTime, endTime, _, numFiles, totalSize := AssembleFilelisting(tempDir, "", &skip) - + fullFileArray, startTime, endTime, _, numFiles, totalSize, err := GetLocalFileList(tempDir, "", &skip) + if err != nil { + t.Errorf("got error: %v", err) + } + // Check that the returned file array contains the correct file if len(fullFileArray) != 1 { t.Fatalf("Expected 1 file, got %d", len(fullFileArray)) @@ -43,10 +46,10 @@ func TestAssembleFilelisting(t *testing.T) { if time.Since(fileTime) > time.Second { t.Errorf("Expected file time within 1 second of now") } - + // Check the other outputs of AssembleFilelisting if time.Since(startTime) > time.Second { - t.Errorf("Expected start time within 1 second of now") + t.Errorf("Expected start time within 1 second of now - res: %s", startTime) } if time.Since(endTime) > time.Second { t.Errorf("Expected end time within 1 second of now") diff --git a/datasetIngestor/testForExistingSourceFolder.go b/datasetIngestor/testForExistingSourceFolder.go index d1d7aa5..72f8b3e 100644 --- a/datasetIngestor/testForExistingSourceFolder.go +++ b/datasetIngestor/testForExistingSourceFolder.go @@ -4,23 +4,20 @@ import ( "encoding/json" "fmt" "io" - "log" "net/http" "strings" - - "github.com/fatih/color" ) -/* Check if sourceFolders have already been used by existing datasets and give warning +/* + Check if sourceFolders have already been used by existing datasets and give warning The idea is to send ONE query which tests all datasets in one go, (but see chunking need below) -The filter condition can be defined within the header instead of the url +# The filter condition can be defined within the header instead of the url The filter size limit is server dependent: typically 8kB for the header, for URL length 2 kB (IE) Both limits may well be exceeed e.g. for 400 datasets Therefore split query into many chunks if too many folders are used in one job - */ type DatasetInfo struct { Pid string `json:"pid"` @@ -28,7 +25,7 @@ type DatasetInfo struct { Size int `json:"size"` } -type QueryResult []DatasetInfo +type DatasetQuery []DatasetInfo /* TestForExistingSourceFolder checks if the provided source folders already exist on the API server. @@ -42,38 +39,34 @@ Parameters: The function splits the folders into chunks of 100 and sends a GET request to the API server for each chunk. If a source folder already exists, the function logs a warning and asks the user if they want to continue. If the user chooses not to continue, the function stops the process and logs an error message. */ -func TestForExistingSourceFolder(folders []string, client *http.Client, APIServer string, accessToken string, allowExistingSourceFolder *bool) { +func TestForExistingSourceFolder(folders []string, client *http.Client, APIServer string, accessToken string) (foundList DatasetQuery, err error) { // Split into chunks of 100 sourceFolders const chunkSize = 100 all := len(folders) chunks := (all-1)/chunkSize + 1 url := APIServer + "/Datasets?access_token=" + accessToken - - if allowExistingSourceFolder == nil || !(*allowExistingSourceFolder) { - for i := 0; i < chunks; i++ { - start := i * chunkSize - end := start + chunkSize - if end > all { - end = all - } - log.Printf("Checking sourceFolder %v to %v for existing entries...\n", start+1, end) - - sourceFolderList := strings.Join(folders[start:end], "\",\"") - filter := createFilter(sourceFolderList) - resp := makeRequest(client, url, filter) - respObj := processResponse(resp) - - if len(respObj) > 0 { - var item DatasetInfo - for _, item = range respObj { - log.Printf("Folder: %v, size: %v, PID: %v", item.SourceFolder, item.Size, item.Pid) - } - if !confirmIngestion(allowExistingSourceFolder) { - log.Fatalf("Use the flag -allowexistingsource to ingest nevertheless\n") - } - } + + for i := 0; i < chunks; i++ { + start := i * chunkSize + end := start + chunkSize + if end > all { + end = all + } + + sourceFolderList := strings.Join(folders[start:end], "\",\"") + filter := createFilter(sourceFolderList) + resp, err := makeRequest(client, url, filter) + if err != nil { + return DatasetQuery{}, err + } + defer resp.Body.Close() + processedResp, err := processResponse(resp) + if err != nil { + return foundList, err } + foundList = append(foundList, processedResp...) } + return foundList, err } func createFilter(sourceFolderList string) string { @@ -82,56 +75,31 @@ func createFilter(sourceFolderList string) string { return fmt.Sprintf("%s%s%s", header, sourceFolderList, tail) } -func makeRequest(client *http.Client, url string, filter string) *http.Response { +func makeRequest(client *http.Client, url string, filter string) (*http.Response, error) { req, err := http.NewRequest("GET", url, nil) if err != nil { - log.Fatal("Error creating request: ", err) + return nil, err } req.Header.Set("Content-Type", "application/json") req.Header.Set("filter", filter) - + resp, err := client.Do(req) if err != nil { - log.Fatal(err) + return nil, err } - defer resp.Body.Close() - return resp + return resp, nil } -func processResponse(resp *http.Response) QueryResult { +func processResponse(resp *http.Response) (DatasetQuery, error) { body, _ := io.ReadAll(resp.Body) - var respObj QueryResult + var respObj DatasetQuery if len(body) == 0 { - log.Printf("Warning: Response body is empty") - return respObj + // ignoring empty response... + return respObj, nil } err := json.Unmarshal(body, &respObj) if err != nil { - log.Printf("Error: Failed to parse JSON response: %v", err) - } - return respObj -} - -func confirmIngestion(allowExistingSourceFolder *bool) bool { - color.Set(color.FgYellow) - log.Printf("Warning: The following sourceFolders have already been used") - continueFlag := true - if allowExistingSourceFolder == nil { - log.Printf("Do you want to ingest the corresponding new datasets nevertheless (y/N) ? ") - scanner.Scan() - archiveAgain := scanner.Text() - if archiveAgain != "y" { - continueFlag = false - } - } else { - continueFlag = *allowExistingSourceFolder - } - if continueFlag { - log.Printf("You chose to continue the new datasets nevertheless\n") - } else { - log.Printf("You chose not to continue\n") - log.Printf("Therefore the ingest process is stopped here, no datasets will be ingested\n") + return DatasetQuery{}, fmt.Errorf("failed to parse JSON response: %v", err) } - color.Unset() - return continueFlag + return respObj, nil } diff --git a/datasetIngestor/testForExistingSourceFolder_test.go b/datasetIngestor/testForExistingSourceFolder_test.go index 2dc2c27..7bdc876 100644 --- a/datasetIngestor/testForExistingSourceFolder_test.go +++ b/datasetIngestor/testForExistingSourceFolder_test.go @@ -1,12 +1,13 @@ package datasetIngestor import ( + "bytes" + "io" "net/http" "net/http/httptest" "testing" + "github.com/stretchr/testify/assert" - "io" - "bytes" ) func TestTestForExistingSourceFolder(t *testing.T) { @@ -20,18 +21,18 @@ func TestTestForExistingSourceFolder(t *testing.T) { })) // Close the server when test finishes defer server.Close() - + // Use Client & URL from our local test server client := server.Client() APIServer := server.URL accessToken := "testToken" - allowExistingSourceFolder := false - + folders := []string{"folder1", "folder2"} - - TestForExistingSourceFolder(folders, client, APIServer, accessToken, &allowExistingSourceFolder) + + // TODO test the results of this function + TestForExistingSourceFolder(folders, client, APIServer, accessToken) }) - + t.Run("test with existing folders and allowExistingSourceFolder true", func(t *testing.T) { // Create a mock server server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { @@ -42,16 +43,16 @@ func TestTestForExistingSourceFolder(t *testing.T) { })) // Close the server when test finishes defer server.Close() - + // Use Client & URL from our local test server client := server.Client() APIServer := server.URL accessToken := "testToken" - allowExistingSourceFolder := true - + folders := []string{"folder1", "folder2"} - - TestForExistingSourceFolder(folders, client, APIServer, accessToken, &allowExistingSourceFolder) + + // TODO test the results of this function. + TestForExistingSourceFolder(folders, client, APIServer, accessToken) }) } @@ -61,26 +62,35 @@ func TestProcessResponse(t *testing.T) { resp := &http.Response{ Body: io.NopCloser(bytes.NewBufferString(validJSON)), } - result := processResponse(resp) + result, err := processResponse(resp) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } if len(result) != 1 || result[0].Pid != "123" || result[0].SourceFolder != "folder" || result[0].Size != 100 { t.Errorf("Unexpected result: %v", result) } - + // Test with invalid JSON invalidJSON := `{"pid": "123", "sourceFolder": "folder", "size": 100}` resp = &http.Response{ Body: io.NopCloser(bytes.NewBufferString(invalidJSON)), } - result = processResponse(resp) + result, err = processResponse(resp) + if err == nil { + t.Errorf("Expected error from processResponse") + } if len(result) != 0 { t.Errorf("Expected empty QueryResult, got '%v'", result) } - + // Test with empty body resp = &http.Response{ Body: io.NopCloser(bytes.NewBufferString("")), } - result = processResponse(resp) + result, err = processResponse(resp) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } if len(result) != 0 { t.Errorf("Expected empty QueryResult, got '%v'", result) }