Voxceleb源码阅读
# stage1&2 数据准备
# Recipe configuration: load the environment and define the paths and the
# starting stage used by the steps that follow.
. ./cmd.sh   # configure the job commands to run locally
. ./path.sh  # set up the required paths
set -e       # exit on error

mfccdir=/home/ZhouWJ/ASV/voxceleb/mfcc
vaddir=/home/ZhouWJ/ASV/voxceleb/mfcc

# Path parameters.
# NOTE: a '#' only starts a comment when it begins a word, so every trailing
# comment below is separated from the value by whitespace. Written as
# `value# text`, the '#' becomes part of the value and `text` is executed as a
# command.
trials=/home/ZhouWJ/ASV/voxceleb  # where trials.txt is stored (the voxceleb top-level directory)
nnet_dir=/home/ZhouWJ/Code/voxceleb/exp/xvector_nnet_1a
wav=/home/public/Database/VPR/华东理工的数据/zdh/ai2+vox2+1505/wav  # test data; must contain the train, dev and test subfolders
stage=1  # selects which steps to run
1
2
3
4
5
6
7
8
9
10
11
12
2
3
4
5
6
7
8
9
10
11
12
- 查看shell脚本中设置的变量:如果是用 `export` 声明的,可以用 `echo $train_cmd` 查看,但是该变量只对当前窗口(当前 shell 会话)有效;如果是直接赋值,则用 `$train_cmd` 引用。
- `set -e` 只用于命令行中。
- `. ./cmd.sh` 中的内容是 `export train_cmd="run.pl"`。
# stage1 调整格式
# Stage 1: normalise the data layout.
# `[` requires `]` as a separate final argument, hence the space before it.
if [ "$stage" -le 1 ]; then
  # Normalise the utterance IDs and generate the trials file.
  # data_rename.py is only needed when the audio ID is not already prefixed
  # with the speaker ID.
  python3 tools/data_rename.py "$wav"
  python3 tools/create_trials.py "$wav" "$trials"
fi
1
2
3
4
5
2
3
4
5
# stage2 生成必要的表格
# Stage 2: generate the required tables.
# If sourcing path.sh fails, the right-hand side runs and the script exits
# (this line was added by the note's author).
. ./path.sh || exit 1
# `[` requires `]` as a separate final argument, hence the space before it.
if [ "$stage" -le 2 ]; then
  local/data_prep.sh "$wav"
  echo "Stage-2 finish"
fi
1
2
3
4
5
6
2
3
4
5
6
- 对上述操作按以下命令单步执行
# local/data_prep.sh
aishell_data_prep.sh的用法:
Usage: aishell_data_prep.sh <audio-path> <text-path>
aishell_data_prep.sh /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript
1
2
2
参照aishell文件夹中的aishell_data_prep.sh
#. ./path.sh || exit 1;# 左边执行失败,则执行右边
# Builds the wav.scp, utt2spk and spk2utt tables for the train, dev and test
# subsets from a directory tree of wav files.
#
# Arguments: $1 - audio root; files are assigned to a subset by matching
#                 "wav/train", "wav/dev" or "wav/test" in their path.
# Outputs:   per-subset tables under $data_root/local/{train,dev,test},
#            copied to $data_root/{train,dev,test}.
# Returns:   exits 1 on wrong usage, when a subset has no wav file, or when
#            a copy fails; exits 0 otherwise.
if [ $# != 1 ]; then
  echo "Usage: $0 <audio-path>"
  echo " $0 /export/a05/xna/data/data_aishell/wav "
  exit 1
fi

audio_dir=$1
data_root=/home/ZhouWJ/ASV/voxceleb/data
train_dir=$data_root/local/train
dev_dir=$data_root/local/dev
test_dir=$data_root/local/test

mkdir -p "$train_dir"
mkdir -p "$dev_dir"
mkdir -p "$test_dir"

# Find the wav audio files for train, dev and test respectively.
# The subsets are selected by path here rather than passed in one by one,
# possibly to reduce the number of parameters passed between scripts.
# grep exits non-zero when nothing matches, so an empty subset aborts the run.
find "$audio_dir" -iname "*.wav" | grep -i "wav/train" > "$train_dir/wav.flist" || exit 1
find "$audio_dir" -iname "*.wav" | grep -i "wav/dev" > "$dev_dir/wav.flist" || exit 1
find "$audio_dir" -iname "*.wav" | grep -i "wav/test" > "$test_dir/wav.flist" || exit 1

# Count the audio files and warn when the total differs from the expected one.
n=$(cat "$train_dir/wav.flist" "$dev_dir/wav.flist" "$test_dir/wav.flist" | wc -l)
if [ "$n" -ne 141925 ]; then
  echo Warning: expected 141925 data data files, found $n
fi

# Transcriptions preparation
for dir in "$train_dir" "$test_dir" "$dev_dir"; do
  echo Preparing $dir transcriptions
  # Utterance ID = file name without ".wav"; speaker ID = parent directory name.
  sed -e 's/\.wav//' "$dir/wav.flist" | awk -F '/' '{printf("%s\n", $NF)}' > "$dir/utt.list"
  sed -e 's/\.wav//' "$dir/wav.flist" | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > "$dir/utt2spk_all"
  paste -d' ' "$dir/utt.list" "$dir/wav.flist" > "$dir/wav.scp_all"
  #utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text_dir/*.txt > $dir/transcripts.txt
  #awk '{print $1}' $dir/transcripts.txt | sort -u > $dir/utt.list
  # Keep only the entries listed in utt.list and de-duplicate them.
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$dir/utt2spk_all" | sort -u > "$dir/utt2spk"
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$dir/wav.scp_all" | sort -u > "$dir/wav.scp"
  #sort -u $dir/transcripts.txt > $dir/text
  utils/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt"
done

# Move the final tables into place.
mkdir -p "$data_root/train" "$data_root/test" "$data_root/dev"
for f in spk2utt utt2spk wav.scp; do
  cp "$train_dir/$f" "$data_root/train/$f" || exit 1
  cp "$test_dir/$f" "$data_root/test/$f" || exit 1
  # The dev tables must come from the dev subset; they were previously copied
  # from the test subset.
  cp "$dev_dir/$f" "$data_root/dev/$f" || exit 1
done

echo "$0: Data preparation succeeded!"
exit 0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# utils/filter_scp.pl
上次更新: 2024/04/10, 22:12:29