diff --git a/README.md b/README.md index 9fd085c..58e7b7e 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,12 @@ Allows the script to run successfully in a non-interactive shell. The script will utilize the default [`--location`](#l-location) and [`--filename`](#f-filename) settings unless the respective flags are explicitely set. +### ignore-robots + +- Usage: `-i`, `--ignore-robots` + +Ignore robots.txt for the domain. + ### wget - Usage: `-w`, `--wget` @@ -242,6 +248,6 @@ In addition, specific site (including WordPress) files and directories are also ## Advanced Usage -The script should filter out most unwanted file types and directories; however, you can edit the regular expressions that filter out certain pages, directories, and file types by editing the `fetchSiteUrls()` function within the `fetchurls.sh` file. +The script should filter out most unwanted file types and directories; however, you can edit the regular expressions that filter out certain pages, directories, and file types by editing the `fetchUrlsForDomain()` function within the `fetchurls.sh` file. **Warning**: If you're not familiar with [grep](https://man7.org/linux/man-pages//man1/grep.1.html) or regular expressions, you can easily break the script. diff --git a/fetchurls.sh b/fetchurls.sh index 1476652..a6007f1 100644 --- a/fetchurls.sh +++ b/fetchurls.sh @@ -1,10 +1,11 @@ #!/bin/bash -VERSION="v3.2.1" +VERSION="v3.2.2" # Set Defaults WGET_INSTALLED=0 RUN_NONINTERACTIVE=0 +IGNORE_ROBOTS=0 SHOW_HELP=0 SHOW_WGET_INSTALL_INFO=0 SHOW_VERSION=0 @@ -56,6 +57,9 @@ while (( "$#" )); do -t|--troubleshooting) SHOW_TROUBLESHOOTING=1 ;; + -i|--ignore-robots) + IGNORE_ROBOTS=1 + ;; # DOMAIN -d|--domain) if [ "$2" ]; then @@ -254,6 +258,8 @@ showHelp() echo " -n, --non-interactive Allows the script to run successfully in a non-interactive shell." echo " Uses the default --location and --filename settings unless the corresponding flags are set." echo "" + echo " -i, --ignore-robots Ignore robots.txt for the domain." + echo "" echo " -w, --wget Show wget install instructions." echo " The installation process may vary depending on your computer's configuration." echo "" @@ -297,6 +303,7 @@ showTroubleshooting() echo " SHOW_HELP: ${COLOR_CYAN}$SHOW_HELP${COLOR_RESET}" echo " SHOW_VERSION: ${COLOR_CYAN}$SHOW_VERSION${COLOR_RESET}" echo " RUN_NONINTERACTIVE: ${COLOR_CYAN}$RUN_NONINTERACTIVE${COLOR_RESET}" + echo " IGNORE_ROBOTS: ${COLOR_CYAN}$IGNORE_ROBOTS${COLOR_RESET}" echo " USER_DOMAIN: ${COLOR_CYAN}$USER_DOMAIN${COLOR_RESET}" echo " USER_FILENAME: ${COLOR_CYAN}$USER_FILENAME${COLOR_RESET}" echo " USER_SAVE_LOCATION: ${COLOR_CYAN}$USER_SAVE_LOCATION${COLOR_RESET}" @@ -335,7 +342,7 @@ displaySpinner() } fetchUrlsForDomain() { - cd $USER_SAVE_LOCATION && wget --spider -r -nd --max-redirect=30 $USER_SLEEP $USER_CREDENTIALS $USER_DOMAIN 2>&1 \ + cd $USER_SAVE_LOCATION && wget --spider -r -nd --max-redirect=30 $IGNORE_ROBOTS $USER_SLEEP $USER_CREDENTIALS $USER_DOMAIN 2>&1 \ | grep '^--' \ | awk '{ print $3 }' \ | grep -E -v '\.('${USER_EXCLUDED_EXTENTIONS}')(\?.*)?$' \ @@ -528,6 +535,13 @@ else USER_SLEEP="--wait=${USER_SLEEP}" fi +# Check for IGNORE_ROBOTS +if [ -z "$IGNORE_ROBOTS" ] || [ "$IGNORE_ROBOTS" -eq 0 ]; then + IGNORE_ROBOTS= +else + IGNORE_ROBOTS="--execute robots=off" +fi + # Check for credentials if [ -z "$USER_CREDENTIALS_USERNAME" ] || [ -z "$USER_CREDENTIALS_PASSWORD" ]; then USER_CREDENTIALS=