Production practice:
Using shell scripts to manage file data in HDFS
Techniques covered:
Shell arrays, date/time handling with date, awk, and the hdfs command-line tools
Script details:
The scripts query, count, and delete HDFS files older than n days, then apply different retention periods to specified partition paths. The work is split into several scripts that can be used as-is; all have been validated in production. Use the delete operations with caution: keep deletion in a separate, standalone step, and make sure to test, back up, roll out gradually (canary), and log every run.
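For the backup step mentioned above, a cheap rollback point before any -skipTrash deletion is an HDFS snapshot. A minimal sketch, assuming snapshots may be enabled on the directory (admin rights required; the table path is just an illustrative one from the scripts below):

# Enable snapshots on the table directory, then take a named snapshot.
# Data referenced by a snapshot survives subsequent deletions and can be
# recovered from the .snapshot subdirectory.
TABLE="hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_carlife_sdk_log_hi"
hdfs dfsadmin -allowSnapshot "$TABLE"
hdfs dfs -createSnapshot "$TABLE" "before-cleanup-$(date +%F)"
# To roll back, copy files out of "$TABLE/.snapshot/before-cleanup-<date>"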
HDFS file query, statistics, and deletion script: list_data_by_day.sh
#!/bin/bash
# Query, count, or delete HDFS files older than N days.
AGO="`date --date "$1 days ago" "+%F %R"`"
INPATH="${3:-/tmp/hive}"
USAGE="Usage: $0 [N days ago] (listFs|listDir|size|delete) [path, default /tmp/hive]"

if [[ -z "$1" || -z "$2" ]]; then
    echo "$USAGE"
    exit 1
fi

echo ">>> Searching for files older than $AGO"
echo ">>> Searching under $INPATH"

case $2 in
    listFs)
        # Recursively list plain files (lines not starting with "d") older than AGO
        hdfs dfs -ls -R "$INPATH" |\
            awk '$1 ~ /^[^d]/ && ($6 " " $7) < '"\"$AGO\""
        ;;
    listDir)
        # List directories (lines starting with "d") older than AGO
        hdfs dfs -ls "$INPATH" |\
            awk '$1 ~ /^d/ && ($6 " " $7) < '"\"$AGO\""
        ;;
    size)
        # Sum the sizes of matching old files and report the overall directory size
        dirSize=$(hdfs dfs -du -s -h "$INPATH" | awk '{print $1" "$2}')
        hdfs dfs -ls -R "$INPATH" |\
            awk '$1 ~ /^[^d]/ && ($6 " " $7) < "'"$AGO"'" { sum += $5; cnt += 1 } END { print cnt, "files, total", sum, "bytes; directory size:", "'"$dirSize"'" }'
        ;;
    delete)
        # Permanently delete matching files (bypasses the trash -- use with care)
        hdfs dfs -ls -R "$INPATH" |\
            awk '$1 ~ /^[^d]/ && ($6 " " $7) < "'"$AGO"'" {print $8}' | \
            xargs hdfs dfs -rm -skipTrash
        ;;
    *)
        echo "$USAGE"
        exit 1
        ;;
esac

# Write the partition directories older than 90 days to a to-delete list file:
# ./list_data_by_day.sh 90 listDir hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_carlife_sdk_log_hi | awk 'NR>2 {print $NF}' > chegva_com_carlife_sdk_log_hi-`date +%F`
# hdfs dfs -du hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/*/ | awk 'NF==2 {print $1","$2} NF>2 {print $1$2","$3}' | sort -hr
# hdfs dfs -du hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/*/ | awk '{print $1","$2}' > warehhouse_tablesize_count_20240508
# hdfs dfs -put warehhouse_tablesize_count_20240508 hdfs://anzhihe-hadoop-1234567/user/anzhihe_sz_dw/hive_storage/
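The awk filters above compare ($6 " " $7), the date and time columns of hdfs dfs -ls output, against $AGO as plain strings. That works only because "YYYY-MM-DD HH:MM" is fixed-width and zero-padded, so lexicographic order coincides with chronological order. A standalone sketch of the same comparison:

# String order equals time order for fixed-width, zero-padded timestamps
printf '%s\n' "2024-04-01 08:00" "2024-05-20 23:59" |
    awk -v cutoff="2024-05-01 00:00" '$0 < cutoff { print $0, "is older than the cutoff" }'
# prints: 2024-04-01 08:00 is older than the cutoff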
HDFS file deletion script: del_data_by_day.sh
#!/bin/bash
# Usage: bash del_data_by_day.sh chegva_com_carlife_sdk_log_hi-2024-03-19
FILEPATH=$1
LOGFILE="${FILEPATH}.log"

# Timestamped logger: techo "message" [LEVEL]
techo() {
    local msg=$1
    local msg_level=${2:-'INFO'}
    echo "`date '+%F %T'` - [${msg_level}] ${msg}" >> ${LOGFILE}
}

techo "Starting cleanup of historical directories..."
while read -r line; do
    # Echo the command first so the log records exactly what was run
    echo hdfs dfs -rm -r -skipTrash $line | tee -a ${LOGFILE}
    hdfs dfs -rm -r -skipTrash $line | tee -a ${LOGFILE}
done < ${FILEPATH}
techo "Finished cleaning up historical directories."

# Count the dt=* partitions of a table:
# hdfs dfs -find hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_lbs_tencent_order_df -name "dt=*" | wc -l
# Build a to-delete list of partition directories older than 31 days:
# sh list_data_by_day.sh 31 listDir hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_lbs_sn_path_list_df | awk 'NR>2 {print $NF}' > chegva_com_lbs_sn_path_list_df-$(date +%F)
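Before feeding a list file to del_data_by_day.sh, a pre-flight check that every queued path sits under the expected warehouse prefix catches accidents like blank lines or stray output in the list. A minimal sketch; LIST and PREFIX are hypothetical values to adapt:

# Abort unless every line in the to-delete list starts with the expected prefix
LIST="chegva_com_carlife_sdk_log_hi-2024-03-19"   # hypothetical list file
PREFIX="hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/"
if grep -qv "^${PREFIX}" "$LIST"; then
    echo "Unexpected path in $LIST, aborting" >&2
    exit 1
fi
echo "$(wc -l < "$LIST") directories queued for deletion"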
HDFS data retention policy and statistics upload script: keep_data_by_month.sh
#!/bin/bash
# HDFS historical data retention policy and statistics.
# Crontab entry:
# 30 1 * * * cd /home/hadoop/scripts && bash keep_data_by_month.sh > /dev/null 2>&1
#set -e
source /etc/profile

# Tables grouped by retention period
declare -a keep_one_month=(
    "hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_lbs_sn_path_list_df"
    "hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_lbs_tencent_order_df"
    # ......
)
declare -a keep_three_month=(
    "hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_carlife_sdk_log_hi"
    "hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_bind_manage_history_hf"
    # ......
)
declare -a keep_six_month=(
    "hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_lbs_path_detail_di"
    "hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_iot_bizinfo_activesn_view_di"
    # ......
)
declare -a keep_one_year=(
    "hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_feature_df"
    # ......
)

LOGFILE="keep_data_by_day-`date '+%F_%T'`.log"

# Timestamped logger: techo "message" [LEVEL]
techo() {
    local msg=$1
    local msg_level=${2:-'INFO'}
    echo "`date '+%F %T'` - [${msg_level}] ${msg}" >> ${LOGFILE}
}

# Add or subtract months. Takes two arguments: a date in yyyy-mm-dd format and a
# number of months (negative for subtraction). The day is clamped to the length
# of the target month.
plus_months() {
    date -d $1 +%F > /dev/null
    IFS=- read -r year month day <<< $1
    # Force base 10 so zero-padded months/days (08, 09) are not parsed as octal
    month=$((10#$month))
    day=$((10#$day))
    num=$2
    ((month+=num))
    if [ $month -gt 12 ]; then
        # Carry whole years and map the month back into 1..12
        ((year+=(month-1)/12))
        month=$(((month-1)%12+1))
    else
        while [ $month -le 0 ]; do
            ((month+=12))
            ((year-=1))
        done
    fi
    days=$(get_days_of_month $year $month)
    if [ $days -lt $day ]; then
        day=$days
    fi
    date -d $year-$month-$day +%F
}

# Days in a given month; handles February having 28 or 29 days.
get_days_of_month() {
    date_str=$(date -d "$1-$2-01 +1 month -1 day" +%F)
    date -d $date_str +%d
}

# Delete the dt=<date> partition that is exactly N months old for each path;
# $1 is a (negative) month offset, the remaining arguments are table paths.
keep_data_by_month() {
    #local date_ago="`date -d "$1 days ago" "+%Y-%m-%d"`"
    local today=$(date +%F)
    local date_ago=$(plus_months $today $1)
    shift
    local path=("$@")
    for i in ${path[@]}; do
        # Log the command, then execute it
        echo hdfs dfs -rm -r -skipTrash $i/dt=$date_ago >> ${LOGFILE}
        hdfs dfs -rm -r -skipTrash $i/dt=$date_ago >> ${LOGFILE}
    done
}

# Measure the size of every table under /user/hive/warehouse/ and upload to COS
count_warehouse_tablesize() {
    hdfs dfs -du hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/*/ | awk '{print strftime("%Y-%m-%d",systime())","$1","$2}' > warehhouse_tablesize_count_`date '+%F'`.csv
    hdfs dfs -put warehhouse_tablesize_count_`date '+%F'`.csv hdfs://anzhihe-hadoop-1234567/user/chegva_com/hive_storage/
}

del_log() {
    # Delete log files older than a week
    find . -type f -name "keep_data_by_day*.log" -mtime +6 -delete
    # Delete table-size statistics files older than a week
    find . -type f -name "warehhouse_tablesize_count*.csv" -mtime +6 -delete
}

main() {
    techo "Starting data cleanup..."
    techo "Deleting partitions older than one month!"
    keep_data_by_month -1 "${keep_one_month[@]}"
    techo "Deleting partitions older than three months!"
    keep_data_by_month -3 "${keep_three_month[@]}"
    techo "Deleting partitions older than six months!"
    keep_data_by_month -6 "${keep_six_month[@]}"
    techo "Deleting partitions older than one year!"
    keep_data_by_month -12 "${keep_one_year[@]}"
    techo "Data cleanup finished."
    techo "Starting warehouse table size statistics..."
    count_warehouse_tablesize
    techo "Finished warehouse table size statistics."
    del_log
}

main
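plus_months is worth spot-checking around month boundaries, February, and zero-padded months. The expected results below assume plus_months and get_days_of_month have been pasted into an interactive shell (do not source the script itself, since it runs main on load):

plus_months 2024-03-31 -1   # 2024-02-29: leap year, day clamped to month length
plus_months 2024-08-15 -6   # 2024-02-15: zero-padded month parsed as base 10
plus_months 2023-01-31 -2   # 2022-11-30: crosses a year boundary, day clamped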
Sample output:
2024-05-17 10:07:20 - [INFO] Starting data cleanup...
2024-05-17 10:07:20 - [INFO] Deleting partitions older than one month!
hdfs dfs -rm -r -skipTrash hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/ods_anzhihe_sn_sdk_log_di/dt=2024-04-17
Deleted hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/ods_anzhihe_sn_sdk_log_di/dt=2024-04-17
......
2024-05-17 10:07:20 - [INFO] Deleting partitions older than three months!
hdfs dfs -rm -r -skipTrash hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_tencent_map_log_di/dt=2024-02-17
Deleted hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_sn_tencent_map_log_di/dt=2024-02-17
......
2024-05-17 10:07:20 - [INFO] Deleting partitions older than six months!
hdfs dfs -rm -r -skipTrash hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_lbs_path_detail_di/dt=2023-11-17
Deleted hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/chegva_com_lbs_path_detail_di/dt=2023-11-17
......
2024-05-17 10:07:20 - [INFO] Deleting partitions older than one year!
hdfs dfs -rm -r -skipTrash hdfs://anzhihe-hadoop-1234567/user/hive/warehouse/anzhihe.db/dw_anzhihe_sn_feature_df/dt=2023-05-17
......
2024-05-17 10:07:20 - [INFO] Starting warehouse table size statistics...
2024-05-17 10:07:20 - [INFO] Finished warehouse table size statistics.