Production practice:
Using shell scripts to manage file data in HDFS
Skills involved:
Shell arrays, date/time handling with date, awk, hdfs command usage
Script overview:
These scripts query, count, and delete HDFS files older than N days, then apply different retention periods to specified partition paths. The work is split into several scripts that can be used as-is; they have been verified in production. Note: use the delete operations with caution. Deletion is best split out and run separately, with proper testing, backups, gradual rollout, and logging.
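For example, one way to hedge a risky purge is to move candidate paths into a dated backup directory first and only delete after verification. A minimal sketch (the backup path and input file name are illustrative, not from the scripts below):

# Hypothetical backup-before-delete pattern; adjust BACKUP_ROOT to a path with quota.
BACKUP_ROOT="hdfs://anzhihe-hadoop-1234567/tmp/hdfs_backup/$(date +%F)"
hdfs dfs -mkdir -p "$BACKUP_ROOT"
while read -r p; do
    # Same-cluster move is a cheap metadata operation; beware basename collisions.
    hdfs dfs -mv "$p" "$BACKUP_ROOT/"
done < paths_to_delete.txt
# After a soak period, purge for real:
# hdfs dfs -rm -r -skipTrash "$BACKUP_ROOT"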
HDFS file query, statistics, and deletion script: list_data_by_day.sh
#!/bin/bash
USAGE="Usage: $0 [N days ago] (listFs|listDir|size|delete) [path, default /tmp/hive]"
if [[ -z "$1" || -z "$2" ]]; then
    echo "$USAGE"
    exit 1
fi
AGO="$(date --date "$1 days ago" "+%F %R")"
INPATH="${3:-/tmp/hive}"
echo ">>> 将搜索 $AGO 之前的文件"
echo ">>> 将在 $INPATH 下进行搜索"
case $2 in
    listFs)
        # Regular files only ($1 not starting with "d"); string comparison of
        # "YYYY-MM-DD HH:MM" timestamps is also a chronological comparison.
        hdfs dfs -ls -R "$INPATH" |\
            awk '$1 ~ /^[^d]/ && ($6 " " $7) < "'"$AGO"'"'
        ;;
    listDir)
        hdfs dfs -ls "$INPATH" |\
            awk '$1 ~ /^d/ && ($6 " " $7) < "'"$AGO"'"'
        ;;
    size)
        dirSize=$(hdfs dfs -du -s -h "$INPATH" | awk '{print $1" "$2}')
        hdfs dfs -ls -R "$INPATH" |\
            awk '$1 ~ /^[^d]/ && ($6 " " $7) < "'"$AGO"'" {
                sum += $5; cnt += 1 } END {
                print cnt, "files,", sum, "bytes in total; directory size:", "'"$dirSize"'" }'
        ;;
    delete)
        # $8 is the path column; paths containing spaces are not handled.
        # xargs -r skips the rm entirely when nothing matches.
        hdfs dfs -ls -R "$INPATH" |\
            awk '$1 ~ /^[^d]/ && ($6 " " $7) < "'"$AGO"'" {print $8}' |\
            xargs -r hdfs dfs -rm -skipTrash
        ;;
    *)
        echo "$USAGE"
        exit 1
        ;;
esac
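The filtering relies on the fact that hdfs dfs -ls prints the modification time in columns 6 and 7 as "YYYY-MM-DD HH:MM", so a plain lexicographic string comparison against the output of date "+%F %R" orders chronologically. A standalone way to convince yourself, no cluster needed (the two sample lines mimic hdfs dfs -ls output):

printf '%s\n' "-rw-r--r-- 3 hadoop supergroup 1024 2024-04-01 08:30 /tmp/hive/a" \
              "drwxr-xr-x - hadoop supergroup    0 2024-05-10 09:00 /tmp/hive/d" |
    awk '$1 ~ /^[^d]/ && ($6 " " $7) < "2024-05-01 00:00" {print $8}'
# Prints only /tmp/hive/a: a regular file, and older than the cutoff.

Usage examples follow in the comments below.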
# Write the list of partition directories older than 90 days into a to-be-deleted
# list file (NR>2 skips the two ">>>" banner lines the script prints):
# ./list_data_by_day.sh 90 listDir hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_carlife_sdk_log_hi | awk 'NR>2 {print $NF}' > chegva_com_carlife_sdk_log_hi-`date +%F`
# hdfs dfs -du hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/*/ | awk 'NF==2 {print $1","$2} NF>2 {print $1$2","$3}' | sort -hr
# hdfs dfs -du hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/*/ | awk '{print $1","$2}' > warehhouse_tablesize_count_20240508
# hdfs dfs -put warehhouse_tablesize_count_20240508 hdfs://anzhihe-hadoop-1234567/user/anzhihe_sz_dw/hive_storage/

HDFS file deletion script: del_data_by_day.sh
#!/bin/bash
# Usage: bash del_data_by_day.sh chegva_com_carlife_sdk_log_hi-2024-03-19
FILEPATH=$1
LOGFILE="${FILEPATH}.log"

techo() {
    local msg=$1
    local msg_level=${2:-'INFO'}
    echo "$(date '+%F %T') - [${msg_level}] ${msg}" >> "${LOGFILE}"
}

techo "Starting cleanup of historical directories..."
while read -r line; do
    echo hdfs dfs -rm -r -skipTrash "$line" | tee -a "${LOGFILE}"
    hdfs dfs -rm -r -skipTrash "$line" | tee -a "${LOGFILE}"
done < "${FILEPATH}"
techo "Finished cleaning up historical directories."
# hdfs dfs -find hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_lbs_tencent_order_df -name "dt=*" | wc -l
# sh list_data_by_day.sh 31 listDir hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_lbs_sn_path_list_df | awk 'NR>2 {print $NF}' > chegva_com_lbs_sn_path_list_df-$(date +%F)

HDFS data retention policy configuration and statistics upload script: keep_data_by_month.sh
#!/bin/bash
# HDFS historical data retention policy configuration and statistics
# 30 1 * * * cd /home/hadoop/scripts && bash keep_data_by_month.sh > /dev/null 2>&1
#set -e
source /etc/profile
declare -a keep_one_month=(
"hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_lbs_sn_path_list_df"
"hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_lbs_tencent_order_df"
......
)
declare -a keep_three_month=(
"hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_carlife_sdk_log_hi"
"hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_bind_manage_history_hf"
......
)
declare -a keep_six_month=(
"hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_lbs_path_detail_di"
"hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_iot_bizinfo_activesn_view_di"
......
)
declare -a keep_one_year=(
"hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_feature_df"
......
)
LOGFILE="keep_data_by_day-`date '+%F_%T'`.log"
techo() {
local msg=$1
local msg_level=${2:-'INFO'}
echo "`date '+%F %T'` - [${msg_level}] ${msg}" >> ${LOGFILE}
}
# Add or subtract months. Takes two arguments: a date in "yyyy-mm-dd" format and
# a number of months, which may be negative for subtraction.
plus_months() {
    date -d "$1" +%F > /dev/null || return 1
    IFS=- read -r year month day <<< "$1"
    # Force base 10: "08" and "09" would otherwise be parsed as invalid octal
    # numbers inside bash arithmetic.
    month=$((10#$month))
    day=$((10#$day))
    num=$2
    ((month += num))
    if [ $month -gt 12 ]; then
        # Shift by (month-1) so exact multiples of 12 land on month 12, not 0.
        ((year += (month - 1) / 12))
        month=$(( (month - 1) % 12 + 1 ))
    else
        while [ $month -le 0 ]; do
            ((month += 12))
            ((year -= 1))
        done
    fi
    days=$(get_days_of_month $year $month)
    if [ $days -lt $day ]; then
        day=$days
    fi
    date -d "$year-$month-$day" +%F
}
# Number of days in the given month; handles February's 28/29 days so the
# resulting date stays accurate.
get_days_of_month() {
    # Last day of the month = first day of the month, plus 1 month, minus 1 day.
    date_str=$(date -d "$1-$2-01 +1 month -1 day" +%F)
    date -d "$date_str" +%d
}
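A quick sanity check of the month arithmetic, assuming the two functions above are sourced into the shell. This also shows the GNU date pitfall they work around: plain date -d "2024-03-31 -1 month" builds the nonexistent 2024-02-31 and normalizes it forward.

plus_months 2024-03-31 -1           # 2024-02-29 (leap year, clamped to month end)
plus_months 2023-03-31 -1           # 2023-02-28
plus_months 2024-01-15 -3           # 2023-10-15 (crosses a year boundary)
plus_months 2024-09-17 -1           # 2024-08-17 ("09"/"08" no longer break the arithmetic)
date -d "2024-03-31 -1 month" +%F   # 2024-03-02 -- the behavior being avoided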
keep_data_by_month() {
    #local date_ago="$(date -d "$1 days ago" "+%Y-%m-%d")"
    local today=$(date +%F)
    local date_ago=$(plus_months "$today" "$1")
    shift
    local path=("$@")
    # Only the partition dated exactly N months back is removed; the daily cron
    # run moves the cutoff forward one day at a time.
    for i in "${path[@]}"; do
        echo hdfs dfs -rm -r -skipTrash "$i/dt=${date_ago}" >> "${LOGFILE}"
        hdfs dfs -rm -r -skipTrash "$i/dt=${date_ago}" >> "${LOGFILE}" 2>&1
    done
}
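One consequence of this design: if the cron job skips a day, that day's partition is never revisited. A hypothetical catch-up helper (a sketch, not part of the original script) could sweep a window of dates at the cutoff:

# Hypothetical: delete partitions for the K days at and before the cutoff.
sweep_missed() {
    local months_ago=$1 days_back=$2
    shift 2
    local cutoff=$(plus_months "$(date +%F)" "$months_ago")
    for ((k = 0; k < days_back; k++)); do
        local d=$(date -d "$cutoff $k days ago" +%F)
        for p in "$@"; do
            hdfs dfs -rm -r -skipTrash "$p/dt=$d" >> "${LOGFILE}" 2>&1
        done
    done
}
# e.g. sweep_missed -1 7 "${keep_one_month[@]}"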
# Measure the data size of every table under /user/hive/warehouse/ and upload
# the stats to COS.
count_warehouse_tablesize() {
    hdfs dfs -du hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/*/ |\
        awk '{print strftime("%Y-%m-%d",systime())","$1","$2}' > warehhouse_tablesize_count_$(date '+%F').csv
    hdfs dfs -put warehhouse_tablesize_count_$(date '+%F').csv hdfs://anzhihe-hadoop-1234567/user/chegva_com/hive_storage/
}
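A caveat worth knowing: depending on the Hadoop version, hdfs dfs -du prints either two columns (size, path) or three (size, disk space consumed with replication, path), which is why the ad-hoc example earlier branched on NF. The function above assumes the two-column format; a version-tolerant variant (a sketch) could use $NF, which is the path in both layouts:

hdfs dfs -du hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/*/ |\
    awk '{print strftime("%Y-%m-%d", systime()) "," $1 "," $NF}'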
del_log() {
    # Delete log files older than a week
    find . -type f -name "keep_data_by_month*.log" -mtime +6 -delete
    # Delete table-size statistics files older than a week
    find . -type f -name "warehhouse_tablesize_count*.csv" -mtime +6 -delete
}
main() {
    techo "Starting data cleanup....."
    techo "Deleting partitions older than one month!"
    keep_data_by_month -1 "${keep_one_month[@]}"
    techo "Deleting partitions older than three months!"
    keep_data_by_month -3 "${keep_three_month[@]}"
    techo "Deleting partitions older than six months!"
    keep_data_by_month -6 "${keep_six_month[@]}"
    techo "Deleting partitions older than one year!"
    keep_data_by_month -12 "${keep_one_year[@]}"
    techo "Data cleanup finished."
    techo "Starting warehouse table size statistics..."
    count_warehouse_tablesize
    techo "Warehouse table size statistics finished."
    del_log
}
main

Sample output:
2024-05-17 10:07:20 - [INFO] Starting data cleanup.....
2024-05-17 10:07:20 - [INFO] Deleting partitions older than one month!
hdfs dfs -rm -r -skipTrash hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/ods_anzhihe_sn_sdk_log_di/dt=2024-04-17
Deleted hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/ods_anzhihe_sn_sdk_log_di/dt=2024-04-17
......
2024-05-17 10:07:20 - [INFO] Deleting partitions older than three months!
hdfs dfs -rm -r -skipTrash hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_tencent_map_log_di/dt=2024-02-17
Deleted hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_tencent_map_log_di/dt=2024-02-17
......
2024-05-17 10:07:20 - [INFO] Deleting partitions older than six months!
hdfs dfs -rm -r -skipTrash hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_lbs_path_detail_di/dt=2023-11-17
Deleted hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_lbs_path_detail_di/dt=2023-11-17
......
2024-05-17 10:07:20 - [INFO] Deleting partitions older than one year!
hdfs dfs -rm -r -skipTrash hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/dw_anzhihe_sn_feature_df/dt=2023-05-17
......
2024-05-17 10:07:20 - [INFO] Starting warehouse table size statistics...
2024-05-17 10:07:20 - [INFO] Warehouse table size statistics finished.