Check for fake googlebot scrapers
Wed, 10/05/2011 - 12:13 — sandipI noticed a bot scraping using fake GoogleBot useragent string.
Here is a one liner that can detect the IPs to ban:
$ awk 'tolower($0) ~ /googlebot/ {print $1}' /var/www/httpd/access_log | grep -v 66.249.71. | sort | uniq -c | sort -n
It does a case-insensitive awk search for keyword "googlebot" from apache log file removing IPs with "66.249.71." which belongs to google and prints the output in a sorted hit count.
You can validate the IPs with:
IP=66.249.71.37 ; reverse=$(dig -x $IP +short | grep googlebot.com) ; ip=$(dig $reverse +short) ; [ "$IP" = "$ip" ] && echo $IP GOOD || echo $IP FAKE
Replace the IP value with the one you want to check.
- sandip's blog
- Login or register to post comments
- Read more
Search bot report
Mon, 03/22/2010 - 13:41 — sandipHere is a simple bash script to get a daily report of search bot results of Success (200) and Failed (404) hits:
#!/bin/bash
# bot_report.sh
# usage: ./bot_report.sh [botName] [logPath]
# default: ./bot_report.sh Googlebot /var/log/httpd/access_log
############################ ##########
# Run this in a daily cron &n bsp; &n bsp; #
# 59 23 * * * /path/to/bot_report.sh #
############################ ##########
# Commands
GREP=/bin/grep
DATE=/bin/date
MKDIR=/bin/mkdir
AWK=/bin/awk
SORT=/bin/sort
UNIQ=/usr/bin/uniq
TMPWATCH=/usr/sbin/tmpwatch< br />CAT=/bin/cat
MAIL=/bin/mail
ECHO=/bin/echo
# Global Variables
DEFAULT_BOT_NAME=Googlebotr />DEFAULT_LOG_FILE=/var/log/ht tpd/access_log
if [ -z "$1" ]; then
BOT_NAME=${DEFAULT_BOT_NAME}r />else
BOT_NAME=$1
fi
if [ -z "$2" ]; then
LOG_FILE=${DEFAULT_LOG_FILE}r />else
LOG_FILE=$2
fi
PREFIX_LOG=`$ECHO $LOG_FILE | sed 's/\//_/g'`
TMP_LOG_PATH=/tmp/bot_report
TMP_LOG_FILE=${TMP_LOG_PATH} /${PREFIX_LOG}_`${DATE} +%F`.log
TMP_REPORT_FILE=${TMP_LOG_PA TH}/${BOT_NAME}_report.txt
EMAIL=user@domain.tld
###########################< br /># Nothing to change below #
###########################< br />
# Produce a temp file to work with for todays date
tmp_file_out() {
[ -d "${TMP_LOG_PATH}" ] || $MKDIR ${TMP_LOG_PATH}
if [ ! -f "${TMP_LOG_FILE}" ]; then
$GREP `$DATE +%d/%b/%Y` $LOG_FILE > $TMP_LOG_FILE
fi
}
# Clean up temp file older than a day
tmp_file_clean() {
$TMPWATCH 24 $TMP_LOG_PATH
}
# Create report
report_out() {
$ECHO "######## Success Hits ########" > $TMP_REPORT_FILE
$GREP " 200 " $TMP_LOG_FILE | $AWK -v bot="$BOT_NAME" '$0 ~ bot {print $7}' | $SORT | $UNIQ -c | $SORT -rn >> $TMP_REPORT_FILE
$ECHO >> $TMP_REPORT_FILE
$ECHO "######## Failed Hits ########" >> $TMP_REPORT_FILE
$GREP " 404 " $TMP_LOG_FILE | $AWK -v bot="$BOT_NAME" '$0 ~ bot {print $7}' | $SORT | $UNIQ -c | $SORT -rn >> $TMP_REPORT_FILE
}
# Mail report
mail_report() {
$CAT $TMP_REPORT_FILE | $MAIL -s "bot report: ${BOT_NAME}" $EMAIL
}
#
# Main
#
tmp_file_out
report_out
mail_report
tmp_file_clean
exit 0
- sandip's blog
- Login or register to post comments