bot

Check for fake googlebot scrapers

I noticed a bot scraping my site using a fake Googlebot user-agent string.

Here is a one-liner that lists the IPs to consider banning:

# The grep pattern is anchored and its dots are escaped so that only addresses starting with 66.249.71. are dropped.
$ awk 'tolower($0) ~ /googlebot/ {print $1}' /var/www/httpd/access_log | grep -v '^66\.249\.71\.' | sort | uniq -c | sort -n

It runs a case-insensitive awk search for the keyword "googlebot" in the Apache log file, drops IPs in the "66.249.71." range (which belongs to Google), and prints the remaining IPs with their hit counts, sorted in ascending order.

You can validate an IP with a reverse DNS lookup followed by a forward lookup of the returned host name:

IP=66.249.71.37
# Forward-confirmed reverse DNS: the PTR name must END in .googlebot.com. (anchored, so a
# spoofed name like crawl.googlebot.com.attacker.net. is rejected) and must resolve back to $IP.
reverse=$(dig -x "$IP" +short | grep -E '\.googlebot\.com\.$' | head -n 1)
if [ -n "$reverse" ] && dig "$reverse" +short | grep -qxF -- "$IP"; then echo "$IP GOOD"; else echo "$IP FAKE"; fi

Replace the IP value with the one you want to check.

Search bot report

Here is a simple bash script that produces a daily report of a search bot's successful (200) and failed (404) hits:

#!/bin/bash
# bot_report.sh
# usage: ./bot_report.sh [botName] [logPath]
# default: ./bot_report.sh Googlebot /var/log/httpd/access_log

######################################
# Run this in a daily cron           #
# 59 23 * * * /path/to/bot_report.sh #
######################################

# Commands
# Absolute paths so the script does not depend on the PATH cron provides.
GREP=/bin/grep
DATE=/bin/date
MKDIR=/bin/mkdir
AWK=/bin/awk
SORT=/bin/sort
UNIQ=/usr/bin/uniq
TMPWATCH=/usr/sbin/tmpwatch
CAT=/bin/cat
MAIL=/bin/mail
ECHO=/bin/echo

# Global Variables
DEFAULT_BOT_NAME=Googlebot
DEFAULT_LOG_FILE=/var/log/httpd/access_log
# First argument: bot name to match; second argument: access log to read.
# Either falls back to its default when missing or empty.
BOT_NAME=${1:-$DEFAULT_BOT_NAME}
LOG_FILE=${2:-$DEFAULT_LOG_FILE}
# Flatten the log path into a file-name-safe token (every "/" becomes "_").
PREFIX_LOG=${LOG_FILE//\//_}
TMP_LOG_PATH=/tmp/bot_report
# One extract per log file per day: <prefix>_<YYYY-MM-DD>.log
TMP_LOG_FILE=${TMP_LOG_PATH}/${PREFIX_LOG}_$("$DATE" +%F).log
TMP_REPORT_FILE=${TMP_LOG_PATH}/${BOT_NAME}_report.txt
EMAIL=user@domain.tld

###########################
# Nothing to change below #
###########################
# Produce a temp file to work with for todays date
# Globals:  TMP_LOG_PATH, TMP_LOG_FILE, LOG_FILE, MKDIR, GREP, DATE (all read)
# Outputs:  writes the lines of LOG_FILE containing today's date (dd/Mon/yyyy)
#           to TMP_LOG_FILE, unless that file already exists
# Returns:  0 on success (including "no lines matched"), non-zero on error
tmp_file_out() {
  local today status

  [ -d "${TMP_LOG_PATH}" ] || "$MKDIR" -p -- "${TMP_LOG_PATH}" || return

  # An extract for today already exists; reuse it.
  [ -f "${TMP_LOG_FILE}" ] && return 0

  today=$("$DATE" +%d/%b/%Y) || return
  "$GREP" -F -- "$today" "$LOG_FILE" > "$TMP_LOG_FILE"
  status=$?

  # grep exits 1 when nothing matched, which is a valid (empty) extract.
  # Anything higher is a real error (e.g. unreadable log): drop the empty
  # file so a later run is not fooled into reusing it.
  if [ "$status" -gt 1 ]; then
    rm -f -- "$TMP_LOG_FILE"
    return "$status"
  fi
  return 0
}

# Clean up temp file older than a day
# Globals:  TMPWATCH (read), TMP_LOG_PATH (read)
tmp_file_clean() {
  # Quoted so a directory name containing whitespace stays a single argument.
  "$TMPWATCH" 24 "$TMP_LOG_PATH"
}

# Print "count path" lines, most-requested first, for the bot's requests that
# returned the given HTTP status.
# Globals:   AWK, SORT, UNIQ, BOT_NAME, TMP_LOG_FILE (all read)
# Arguments: $1 - HTTP status code to select
_bot_hits_by_status() {
  # Compare the status FIELD ($9) instead of grepping for " 200 " anywhere in
  # the line, which also matched e.g. a 404 response whose size was 200 bytes.
  # Assumes a log format where the request path is field 7 and the status is field 9.
  "$AWK" -v bot="$BOT_NAME" -v code="$1" \
    '$9 == code && $0 ~ bot {print $7}' "$TMP_LOG_FILE" \
    | "$SORT" | "$UNIQ" -c | "$SORT" -rn
}

# Create report
# Globals:  TMP_REPORT_FILE (written), plus those read by _bot_hits_by_status
# Outputs:  overwrites TMP_REPORT_FILE with a "Success Hits" (200) section and
#           a "Failed Hits" (404) section, separated by a blank line
report_out() {
  {
    printf '%s\n' "######## Success Hits ########"
    _bot_hits_by_status 200
    printf '\n'
    printf '%s\n' "######## Failed Hits ########"
    _bot_hits_by_status 404
  } > "$TMP_REPORT_FILE"
}

# Mail report
# Globals:  MAIL, TMP_REPORT_FILE, BOT_NAME, EMAIL (all read)
# Sends the contents of TMP_REPORT_FILE to EMAIL with the bot name in the subject.
mail_report() {
  # Feed the report on stdin directly; quoting keeps a report path or
  # recipient containing whitespace intact.
  "$MAIL" -s "bot report: ${BOT_NAME}" "$EMAIL" < "$TMP_REPORT_FILE"
}

#
# Entry point
#
# Runs the steps in order: build today's log extract, write the report,
# mail it, then prune old temp files. The script always exits 0.
main() {
  tmp_file_out
  report_out
  mail_report
  tmp_file_clean
}

main

exit 0

Comment