#! /bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2021 Oracle, Inc.  All Rights Reserved.
#
# FS QA Test No. 648
#
# Test nested log recovery with repeated (simulated) disk failures.  We kick
# off fsstress on a loopback filesystem mounted on the scratch fs, then switch
# out the underlying scratch device with dm-error to see what happens when the
# disk goes down.  Having taken down both fses in this manner, remount them and
# repeat.  This test simulates VM hosts crashing to try to shake out CoW bugs
# in writeback on the host that cause VM guests to fail to recover.
#
. ./common/preamble
_begin_fstest shutdown auto log metadata eio recoveryloop

# Test teardown hook.  Stops fsstress, unmounts and deletes the loop
# mountpoint if one was configured, releases the dm-error device, and finally
# removes this test's temporary files.
_cleanup()
{
	_kill_fsstress

	# $loopmnt stays empty until the test has picked a mountpoint, so there
	# is nothing to unmount or delete before that.
	if test -n "$loopmnt"; then
		_unmount $loopmnt 2> /dev/null
		rm -rf $loopmnt
	fi

	_dmerror_unmount
	_dmerror_cleanup

	# Step out of any directory we are about to delete.
	cd /
	rm -f $tmp.*
}

# Import common functions.
. ./common/dmerror
. ./common/reflink

# Modify as appropriate.

# Feature checks performed before any work is done.
# NOTE(review): judging by the helper names these cover reflink on the scratch
# fs, a reflink-capable cp, the dm "error" target and loop device support --
# confirm against the common/ helpers.
_require_scratch_reflink
_require_cp_reflink
_require_dm_target error
_require_loop

echo "Silence is golden."

# Format the scratch device (mkfs output goes to $seqres.full), then set up
# the dm-error device and mount it.
_scratch_mkfs >> $seqres.full 2>&1
_require_metadata_journaling $SCRATCH_DEV
_dmerror_init
_dmerror_mount

# Create a fs image sized to 1/3 of the space available on the scratch fs
scratch_freesp_bytes=$(_get_available_space $SCRATCH_MNT)
loopimg_bytes=$((scratch_freesp_bytes / 3))

# The image file lives on the scratch fs itself.
loopimg=$SCRATCH_MNT/testfs
truncate -s $loopimg_bytes $loopimg
_mkfs_dev $loopimg

# Mountpoint for the image; it sits under $tmp rather than on the scratch fs.
loopmnt=$tmp.mount
mkdir -p $loopmnt

# Flag files used to coordinate with snap_loop_fs:
#  - scratch_aliveflag: snap_loop_fs keeps snapshotting while this file exists
#  - snap_aliveflag: exists for as long as snap_loop_fs is running
scratch_aliveflag=$tmp.runsnap
snap_aliveflag=$tmp.snapping

# Repeatedly replace $loopimg.a with a fresh reflink copy of $loopimg, pausing
# one second between copies, until $scratch_aliveflag disappears.
# $snap_aliveflag exists for exactly as long as this function is running, so
# the caller can tell when the snapshotter has stopped.
snap_loop_fs() {
	local snapshot=$loopimg.a

	touch "$snap_aliveflag"
	until [ ! -e "$scratch_aliveflag" ]; do
		# Drop the previous snapshot before taking a new one.
		rm -f $snapshot
		_cp_reflink $loopimg $snapshot
		sleep 1
	done
	rm -f "$snap_aliveflag"
}

# Main soak loop.  Each iteration:
#  1. starts the background reflink snapshotter and mounts the loop image,
#  2. runs fsstress on the loop fs for a random, possibly zero, time,
#  3. swaps the scratch device's table for the error table without quiescing,
#  4. tears everything down, restores the working table and remounts the
#     scratch fs so that its log is replayed before the next round.
while _soak_loop_running $((25 * TIME_FACTOR)); do
	# snap_loop_fs keeps snapshotting for as long as this flag file exists.
	touch $scratch_aliveflag
	snap_loop_fs >> $seqres.full 2>&1 &

	if ! _mount $loopimg $loopmnt -o loop; then
		rm -f $scratch_aliveflag
		# Name the dump after the soak iteration, as the failure
		# message does, so dumps from different iterations do not
		# collide.
		_metadump_dev $loopimg $seqres.loop.$SOAK_LOOPIDX.md
		_fail "iteration $SOAK_LOOPIDX loopimg mount failed"
		# NOTE(review): _fail presumably terminates the test; the break
		# is kept as a backstop in case it ever returns.
		break
	fi

	_run_fsstress_bg -d "$loopmnt" -n 999999 -p "$((LOAD_FACTOR * 4))"

	# purposely include 0 second sleeps to test shutdown immediately after
	# recovery
	sleep $((RANDOM % (3 * TIME_FACTOR) ))
	rm -f $scratch_aliveflag

	# This test aims to simulate sudden disk failure, which means that we
	# do not want to quiesce the filesystem or otherwise give it a chance
	# to flush its logs.  Therefore we want to call dmsetup with the
	# --nolockfs parameter; to make this happen we must call the load
	# error table helper *without* 'lockfs'.
	_dmerror_load_error_table

	_kill_fsstress

	# Give the snapshotter up to 10 seconds to notice the missing flag and
	# remove its own alive marker.
	for ((j = 0; j < 10; j++)); do
		test -e "$snap_aliveflag" || break
		sleep 1
	done

	_unmount $loopmnt

	# We must unmount the dmerror device here, or all later testing will
	# break, so retry up to 100 times (one second apart) before giving up.
	# is_unmounted follows the shell status convention: 0 means the unmount
	# succeeded, nonzero means it has not succeeded yet.
	is_unmounted=1
	for ((j = 0; j < 100; j++)); do
		sleep 1
		if _dmerror_unmount > $tmp.unmount.err 2>&1; then
			is_unmounted=0
			break
		fi
	done
	if [ $is_unmounted -ne 0 ]; then
		cat $tmp.unmount.err
		_fail "iteration $SOAK_LOOPIDX scratch unmount failed"
	fi

	# Mount again to replay the log after loading the working table, so we
	# have a consistent fs for the next iteration and after the test.
	_dmerror_load_working_table
	if ! _dmerror_mount; then
		_metadump_dev $DMERROR_DEV $seqres.scratch.$SOAK_LOOPIDX.md
		_fail "iteration $SOAK_LOOPIDX scratch mount failed"
	fi
done

# Make sure the fs image file is ok: mount it once so that its log gets a
# chance to be replayed, then run the fs checker against the image.
if [ -f "$loopimg" ]; then
	if _mount $loopimg $loopmnt -o loop; then
		_unmount $loopmnt &> /dev/null
	else
		# The mount that failed is the loop image's, so capture the
		# image itself as well as the scratch device that hosts it.
		_metadump_dev $loopimg $seqres.loop.final.md
		_metadump_dev $DMERROR_DEV $seqres.scratch.final.md
		# Any output beyond the golden "Silence is golden." line makes
		# the harness flag this run.
		echo "final loopimg mount failed"
	fi
	# The image has no realtime or external log device of its own, so
	# blank those out for the check.
	SCRATCH_RTDEV= SCRATCH_LOGDEV= _check_scratch_fs $loopimg
fi

# success, all done; let the test harness check the scratch fs
status=0
exit
