-
Notifications
You must be signed in to change notification settings - Fork 3
/
gpu.sh
159 lines (150 loc) · 6.17 KB
/
gpu.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/bin/bash
#
# GPU Monitor and Restart Script for EthOS (by cYnIxX3)
#
# Version 1.3.1
#
# INSTALL: `source <(curl -s https://raw.githubusercontent.com/cynixx3/GPU-crash-detector-for-ethOS/master/gpu-install.txt)`
# UPDATE: `wget https://raw.githubusercontent.com/cynixx3/GPU-crash-detector-for-ethOS/master/gpu-install.txt -O /home/ethos/gpu.sh`
# VIEW LOG: `tail /home/ethos/crash.log`
# RUN MANUALLY: `./gpu.sh` (note: it will run every 5 minutes after install.)
#
# !!!!!WARNING!!!!! This is not an official EthOS script. It is not supported
# anywhere. You use this script at your own risk. There is a reason EthOS does
# not include these self regulating reboots. Be sure you have a well built rig.
#
# BY USING THE REBOOT FEATURE IN ANY SCRIPT YOU ARE BYPASSING ETHOS SAFEGARDS
# WHICH CAN RESULT IN YOUR MINER CATCHING ON FIRE!!!!!!!!!!
#
# Set "autoreboot 5" in local.conf if you want this sript to reboot your miner
# Where 5 is the number of times it will reboot. Run `clear-thermals` to reset count.
# or "echo 0 > /opt/ethos/etc/autorebooted.file"
#
#####################################################################################
# If you found this script useful please donate BitCoin to:
# BTC 1G6DcU8GrK1JuXWEJ4CZL2cLyCT57r6en2
# or Ethereum to:
# ETH 0x42D23fC535af25babbbB0337Cf45dF8a54e43C37
#####################################################################################
# CONFIGURATION
logsize="100" # Set the number of lines to keep in the log
loggood="false" # Set "true" if you want an hourly log of success
crashspeed="0" # Set the per gpu hash rate that will trigger a crash condition
minuptime="21600" # Set the number in seconds the script should wait after start before rebooting
# INFORMATION COLLECTION
uptime=$(sed 's/\..*//' /proc/uptime)
minersec=$(grep miner_secs: /var/run/ethos/stats.file | cut -d : -f 2)
updating=$(cat /var/run/ethos/updating.file)
allow=$(cat /opt/ethos/etc/allow.file)
throttled=$(cat /var/run/ethos/throttled.file)
oldIFS=$IFS IFS=" "
read -r -a selectedgpus <<< "$(/opt/ethos/sbin/ethos-readconf selectedgpus)"
read -r -a tempatures <<< "$(grep ^temp: /var/run/ethos/stats.file | cut -d : -f 2 | sed 's/\...//g')"
read -r -a memstates <<< "$(grep ^memstates: /var/run/ethos/stats.file | cut -d : -f 2)"
read -r -a powertunes <<< "$(grep ^powertune: /var/run/ethos/stats.file | cut -d : -f 2)"
read -r -a hashrates <<< "$(tail -1 /var/run/ethos/miner_hashes.file)"
IFS=$oldIFS
hostname=$(hostname)
loc=$(/opt/ethos/sbin/ethos-readconf loc)
DT=$(date +"%D %T")
jumpgpu=0
number='[1-9]'
autoreboot=$(/opt/ethos/sbin/ethos-readconf autoreboot)
rebcount=$(cat /opt/ethos/etc/autorebooted.file)
error=$(cat /var/run/ethos/status.file)
panelURL=$(cat /var/run/ethos/url.file)
if ping -q -c 1 -W 1 8.8.8.8 >/dev/null ; then
net="true"
json=$(curl -s "$panelURL"/?json=yes | sed 's/"},/"}\n/g' | grep "$hostname" | sed -e 's/^{"rigs"://' -e 's/[{"}]//g' -e 's/,/\n/g')
if [[ $json == "" ]] ; then
jcondition="Failing Panel Verification"
else
jcondition=$(printf "%s" "$json" | grep condition: | cut -d : -f 3)
fi
else
net="false"
fi
# LOG STUFF
touch /home/ethos/crash.log
/bin/cp /home/ethos/crash.log /tmp/crash.log
/usr/bin/sudo /bin/chown ethos.ethos /home/ethos/crash.log /tmp/crash.log
function f.truncatelog(){
tail -n $logsize /tmp/crash.log > /home/ethos/crash.log
}
# BASIC RIG CHECKS
if [[ $uptime -lt $minuptime ]] \
|| [[ $minersec -lt "300" ]]; then
if [[ $(date +"%M") == "00" ]] || [ -t 1 ] ; then
echo "$loc has not tried mining long enough $DT" | tee -a /tmp/crash.log
f.truncatelog
fi
exit 2
elif [[ $updating -eq "1" ]]; then
if [[ $(date +"%M") == "00" ]] || [ -t 1 ] ; then
echo "$loc miner is updating $DT" | tee -a /tmp/crash.log
f.truncatelog
fi
exit 3
elif [[ $allow -eq "0" ]]; then
if [[ $(date +"%M") == "00" ]] || [ -t 1 ] ; then
echo "$loc is Disallowed by user command. Run \'allow\' to start mining $DT" | tee -a /tmp/crash.log
f.truncatelog
fi
exit 5
elif [[ $net == "false" ]] ; then
if [[ $(date +"%M") == "00" ]] || [ -t 1 ] ; then
echo "$loc No internet connection $DT" | tee -a /tmp/crash.log
f.truncatelog
fi
exit 7
elif [[ $throttled -eq "1" ]] ; then
if [[ $(date +"%M") == "00" ]] || [ -t 1 ] ; then
echo "$loc has a GPU that overheated - http://ethosdistro.com/kb/#managing-temperature $DT" | tee -a /tmp/crash.log
f.truncatelog
fi
exit 11
elif [[ $error = "hardware error: possible gpu/riser/power failure" ]] ; then
if [[ $(date +"%M") == "00" ]] || [ -t 1 ] ; then
echo "$error - http://ethosdistro.com/kb/#adl $DT" | tee -a /tmp/crash.log
f.truncatelog
fi
exit 13
fi
# MAIN GPU ERROR LOOP
for gpu in ${!hashrates[*]} ; do
# HARDWARE ERROR CHECKING (AKA FIRE PREVENTION CHECKS)
if [[ "${tempatures[gpu]}" == "511" ]] \
|| [[ "${powertunes[gpu]}" = "-1" ]] ; then
if [[ $(date +"%M") == "00" ]] || [ -t 1 ] ; then
echo "$loc - GPU$gpu may have a critical hardware problem that rebooting may not solve. $jcondition - $error$DT" | tee -a /tmp/crash.log
fi
continue
fi
if [[ "${selectedgpus[jumpgpu]}" > "$gpu" ]] ; then
if [ -t 1 ] ; then
echo "GPU$gpu is configured offine with the 'sel' config option - skipping checks on GPU$gpu"
fi
continue
fi
# GPU CRASH DETECTION
if [[ $error == "gpu crashed: reboot required" ]] \
|| [[ $error == "possible miner stall: check miner log" ]] \
|| [[ $(bc <<< "${hashrates[gpu]} <= $crashspeed") -eq "1" ]] ; then
printf '%s GPU%s CRASHED %s\n -Status: %s - %s\n -Hashrate: %s\n -MemState: %s\n -PowerState: %s\n' "$loc" "$gpu" "$DT" "$jcondition" "$error" "${hashrates[*]}" "${memstates[*]}" "${powertunes[*]}"| tee -a /tmp/crash.log
f.truncatelog
if [[ $autoreboot =~ $number ]] && [[ $autoreboot -gt $rebcount ]] ; then
((rebcount++))
echo $rebcount > /opt/ethos/etc/autorebooted.file
/usr/bin/sudo /opt/ethos/bin/hard-reboot
else
echo "$loc has been rebooted too many times. Check why and run \'clear-thermals\' to reset $DT"
fi
exit 17
fi
((jumpgpu++))
done
# GOOD LOGGING
if [[ $loggood == "true" && $(date +"%M") == "00" ]] || [ -t 1 ] ; then
echo "$DT: $loc is $jcondition at $error" | tee -a /tmp/crash.log
f.truncatelog
fi