Bash script to upload logs to Google Cloud Storage

The following Bash script uses Google's gsutil tool to upload hourly rotated server log files (statistics) to Google Cloud Storage.
For a Bash script that loads the log files from Google Cloud Storage into Google BigQuery, please read Bash script to upload from Google Cloud storage to Google Bigquery.

  1. Because the logs rotate every hour, the script works only on files whose last-modified time is older than 61 minutes. This is essential for validating that the process writing the logs has finished writing to the file.
  2. Files whose size exceeds the maximum file size allowed by Google Cloud need to be split (a sketch of one way to do this appears right after this list).
  3. Rename the file to include the server name + date + number of rows. The reasons are:
    • To mark the file as in process, so a newly started run of the script will not pick it up.
    • Because several log servers upload logs for the same hour, the name must include a unique server name and date so files do not overwrite each other in the cloud.
    • The name includes the row count, to ease validating that the rows of this log file were indeed inserted into the BigQuery table.
  4. Sed the file if needed.
  5. Gzip the file.
  6. Upload the file to the cloud and validate the upload.
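Note that the script below only validates the compressed file size and fails if the limit is exceeded. If you actually need to split an oversized log before uploading, a minimal sketch with GNU split (the file names and the 4 GB limit are illustrative) could look like this:

split --line-bytes=4G --numeric-suffixes=1 --additional-suffix=.log rnd_logType.13.log rnd_logType.13.part.

This keeps each chunk line-aligned, so no log record is cut in the middle.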
 
#!/bin/bash
 
###### ABOUT ######
# Author: Ilan Hazan
# This Bash script uploads log files from the local server to Google Cloud Storage.
# 1. Because the logs rotate every hour, the script works only on files whose last-modified time is older than 61 minutes. This is essential for validating that the process writing the logs has finished writing to the file.
# 2. Files whose size exceeds the maximum file size allowed by Google Cloud need to be split.
# 3. Rename the file to include server name + date + number of rows. The reasons are:
# 3.1. To mark the file as in process, so a newly started run of the script will not pick it up.
# 3.2. Because several log servers upload logs for the same hour, the name must include a unique server name and date so files do not overwrite each other in the cloud.
# 3.3. The name includes the row count, to ease validating that the rows of this log file were indeed inserted into the BigQuery table.
# 4. Sed the file if needed.
# 5. Gzip the file.
# 6. Upload the file to the cloud and validate the upload.
# 
 
###### NOTES ######
# Call the script as ./myScript.sh >> /tmp/googleCloud.log 2>&1 in order to redirect stderr to the log file
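# For example, an hourly crontab entry (the minute and script path below are illustrative):
# 15 * * * * /opt/scripts/myScript.sh >> /tmp/googleCloud.log 2>&1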
 
##### SCRIPT CONFIGURATIONS #####
# database name (used to build the Cloud Storage path)
DB="staging"
# the subject of your log. Assuming there are many log types
Subject="logType"
FilePrefix="rnd_"
 
# By default, work on the logs from 2 hours ago. A different relative date
# can be passed to the script as three arguments, e.g. "1 day ago".
Date="2 hour ago"
if [ $# -eq 3 ]; then
    Date="$*"
fi
# Optionally sed the logs. If needed, fill in the sed expression
SedString=''
# Optionally grep the logs. If needed, fill in the grep pattern (regular expression)
GrepString=''
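# Illustrative (hypothetical) values:
# SedString='s/\t/,/g'          # turn tab-separated fields into CSV (GNU sed)
# GrepString='^[0-9]\{4\}-'     # keep only lines that start with a date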
 
 
##### FUNCTIONS GLOBALS #####
GSUTILPath="/opt/gsutil/gsutil"
###################
 
##### SCRIPT GLOBALS #####
LoggerId="loggerNumber"
SleepTime="2m"
LoopTries=3
# Google BigQuery does not allow files bigger than this size
MaxCompressedFileSize=4000000000
GrepExtension="greped"
SedExtension="seded"
UploadedExtension="uploaded"
CloudExtension="candidate"
 
##########################
 
 
##### FUNCTIONS ##########
 
# Output logs to Stderr with date 
# $1 output string
# Return: None
function myLogger(){
	echo "$(date) $1 " >&2
}
 
function fail() {
    exitcode=$1
    shift
    echo "exited with level $exitcode, error was '$@'" >&2
    exit $exitcode
}
 
 
# remove last extension of a file name
# $1 the file name
# Return: the file name without last extension
function removeLastExtension(){
	local fileName=$1
	echo ${fileName%.*}
}
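# e.g. removeLastExtension "rnd_logType.13.log" prints "rnd_logType.13" (file name is illustrative)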
 
 
 
# Sed the file. If a sed expression is given, creates a file with extension *.seded
# $1 the file name
# $2 sed expression
# Globals -
#      SedExtension: the extension given to files that passed sed
# Return: the file name to continue working on. If the sed expression is empty the name remains the same; otherwise the name will be *.seded.
function sedTheFile(){
	local fileName=$1
	local sedString=$2
	local fileNameWithoutExtension=$(removeLastExtension $fileName)
	local retValue=""
 
	if [ -n "$sedString" ]; then
		myLogger "sed for $fileName is not empty"
		sed "$sedString" $fileName > "$fileNameWithoutExtension.$SedExtension"
		myLogger "$fileName finished sed"
		retValue="$fileNameWithoutExtension.$SedExtension"
	else
		myLogger "sed for $fileName is empty"
		retValue=$fileName
	fi
	echo "$retValue"
}
 
 
# Grep the file. If a grep pattern is given, creates a file with extension *.greped
# $1 the file name
# $2 grep pattern
# Globals -
#      GrepExtension: the extension given to files that passed grep
# Return: the file name to continue working on. If the grep pattern is empty the name remains the same; otherwise the name will be *.greped.
function grepTheFile(){
	local fileName=$1
	local grepString=$2
	local fileNameWithoutExtension=$(removeLastExtension $fileName)
	local retValue;
 
	if [ -n "$grepString" ]; then
		myLogger "grep for $rawFile is not empty"
		LC_ALL=C grep "$grepString" $fileName > "$fileNameWithoutExtension.$GrepExtension"
		myLogger "$fileName finished grep "
		retValue="$fileNameWithoutExtension.$GrepExtension"
	else
		myLogger "grep for $fileName is empty "
		retValue=$fileName
	fi
	echo "$retValue"
}
 
 
# extract the hour from the file name. The hour is located between the first two dots.
# $1 the file name
# Return: hour
function extractHourFromFileName(){
   echo "$1" | cut -d'.' -f 2
}
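# e.g. extractHourFromFileName "rnd_logType.07.log" prints "07" (file name is illustrative)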
 
# Validate that file size is smaller than max configured. Fails if not!
# $1 the file name
# $2 max file size  
# Return: None. Fails upon failure!
function validateFileSize(){
	local fileName=$1
	local maxFileSize=$2
	local compressedFileSize=$(stat -c%s "$fileName")
	myLogger "File $fileName size is: $compressedFileSize. Max allowed: $maxFileSize"
	if [ $compressedFileSize -gt $maxFileSize ]; then
		myLogger "Error: File $fileName is larger than $maxFileSize: $compressedFileSize"
		fail 1 "File $fileName is larger than $maxFileSize, please investigate"
	fi
}
 
# get file details from Cloud. 
# $1 the file name including path
# Globals:
#     GSUTILPath - the path to gsutil (Google application for uploading files)
# Return: 0 upon success
function getFileDetailsFromCloud(){
	$GSUTILPath list -l $1
}
 
 
# Validate that the file was uploaded successfully. Fails if not!
# $1 the file name including path
# $2 loop tries - number of tries before declaring failure
# $3 sleep time - sleep between tries
# Return: None. Fails upon failure!
function validateFileLocatedInCloud(){
	local fileName=$1
	local loopTries=$2
	local sleepTime=$3
 
	myLogger "going to validate file exist in Cloud: $fileName "
	getFileDetailsFromCloud $fileName
	local ReturnedResponse=$?
	local var=0
	while [ $ReturnedResponse -ne 0 -a $var -lt $loopTries ]; do
			((var=var+1))
			myLogger "Error: $fileName is not located in Google Cloud as it should be. Response code $ReturnedResponse for $var time" ; date
			sleep $sleepTime
			getFileDetailsFromCloud $fileName
			ReturnedResponse=$?
	done
 
	if [[ $ReturnedResponse -ne 0 ]] ; then
			myLogger "Error: file does not exist in Cloud: $fileName"
			fail 1 "file $fileName is not exist in cloud"
	else
			myLogger "Validated file exist in Cloud: $fileName "
	fi
}
 
# Upload file to Google cloud
# $1 the file path
# $2 the file name
# $3 loop tries - number of tries before declaring failure
# $4 sleep time - sleep between tries
# Globals:
#     GSUTILPath - the path to gsutil (Google application for uploading files)
# Return: None. Fails upon failure!
function uploadFileToCloud(){
	local filePath=$1
	local fileName=$2
	local loopTries=$3
	local sleepTime=$4
 
	myLogger "going to run: $GSUTILPath cp $fileName $filePath "
	$GSUTILPath cp $fileName $filePath
	local ReturnedResponse=$?
	local var=0
	while [ $ReturnedResponse -ne 0 -a $var -lt $loopTries ]; do
			((var=var+1))
			myLogger "Error: $fileName Fail uploading to Google Cloud $filePath with response code $ReturnedResponse for $var time" ; date
			sleep $sleepTime
			$GSUTILPath cp $fileName $filePath
			ReturnedResponse=$?
	done
 
	if [[ $ReturnedResponse -ne 0 ]] ; then
		myLogger "Error: $fileName Fail uploading to Google Cloud $filePath with response code $ReturnedResponse after $loopTries times"
		fail 1 "could not upload file $loopTries to cloud, please investigate"
	fi
}
 
##############################
 
######### DEFAULTS ###########
 
if [[ -z "$Destination" ]]; then
	Destination="$Subject"
fi
 
WorkingDir=$(date -d "$Date" +"%Y/%m/%d")
if [[ -z "$BQDate" ]]; then
	DateInTableName=${WorkingDir//\//}
else
	DateInTableName=$(date -d "$BQDate" +"%Y%m%d")
fi
 
if [[ -z "$FileSizeBytesLimit" ]]; then
	FileSizeBytesLimit="17000000000"
fi
 
BaseDir="/the/directory/contains/logs/" # base directory
FileName="$FilePrefix$Subject"
 
# note: CloudDir is used for creating the BQ table name in other scripts (must be alphanumeric plus underscores)
CloudDir="gs://rndblog/$DB/$Subject/${DateInTableName}_"
##########################################
 
 
for rawFile in $( find $BaseDir$WorkingDir -name "$FileName.*.log" -mmin +61 )
do
    if [ -f $rawFile ]
	then
		mv $rawFile $rawFile.try
		TempFileName="$rawFile.try"
		myLogger ""
		myLogger "**Start working on $rawFile "
 
		# take row count orig
		rowCountOrig=$(wc -l < $TempFileName)
		myLogger "$TempFileName row count is: $rowCountOrig "
 
		# sed the file if needed
		TempFileName=$(sedTheFile $TempFileName "$SedString")
 
		# grep the file if needed 
		TempFileName=$(grepTheFile $TempFileName "$GrepString")
		myLogger "TempFileName is $TempFileName"
		# At this stage there can be orig file + sed? + grep?
 
		# take row count after grep (if was a grep)
		rowCountAfterGrep=$rowCountOrig
		if [ -n "$GrepString" ]; then
			# will get wc without file name
			rowCountAfterGrep=$(wc -l < $TempFileName)
			myLogger "$TempFileName row count after grep is: $rowCountAfterGrep "
		fi
 
		# rename file to include server name + date + num rows
		FileRowsDateAndServerName="$rawFile.$rowCountAfterGrep.$DateInTableName.$LoggerId"
		FileRowsDiffDateAndServerName="$rawFile.$rowCountOrig-$rowCountAfterGrep.$DateInTableName.$LoggerId"
		myLogger "FileRowsDateAndServerName:$FileRowsDateAndServerName FileRowsDiffDateAndServerName:$FileRowsDiffDateAndServerName"
		mv $TempFileName $FileRowsDateAndServerName.csv
 
		# make gzip
		gzip $FileRowsDateAndServerName.csv
		mv $FileRowsDateAndServerName.csv.gz $FileRowsDateAndServerName.csv.$CloudExtension
		TempFileName="$FileRowsDateAndServerName.csv.$CloudExtension"
 
		# Check if file is larger than 4G
		validateFileSize $TempFileName $MaxCompressedFileSize
 
		# extract the hour from file name
		FileHour=$(extractHourFromFileName $rawFile)
 
		# upload the file to Cloud
		UploadingDirectoryForThisFile="$CloudDir$FileHour/"
		uploadFileToCloud $UploadingDirectoryForThisFile $TempFileName $LoopTries $SleepTime
 
		# validate the file exists in the Cloud
		BaseTempFileName=$(basename $TempFileName)
		validateFileLocatedInCloud "$UploadingDirectoryForThisFile$BaseTempFileName" $LoopTries $SleepTime
 
		# Clean up
		# Need to save the original file with two numbers in its name: the original row count and the final row count
		# Rename the file so we know it was uploaded successfully to the cloud
		if [ -f "$rawFile.try" ]; then
			rm -f "$rawFile.$SedExtension"
			rm $TempFileName
			mv $rawFile.try $FileRowsDiffDateAndServerName.$UploadedExtension.csv
			gzip $FileRowsDiffDateAndServerName.$UploadedExtension.csv
		else
			# the case where there was no sed and no grep: the uploaded candidate is the original content itself
			mv $TempFileName $FileRowsDateAndServerName.csv.$UploadedExtension.gz
		fi
	fi
done
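After a run, the uploaded objects can be checked manually with gsutil, for example (the date and hour below are hypothetical; the path follows the CloudDir pattern the script builds):

/opt/gsutil/gsutil ls -l gs://rndblog/staging/logType/20190215_13/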