Voxceleb源码阅读

# stage1&2 数据准备

# Shared setup for stages 1 and 2: load the environment and define the paths
# that the later stage blocks read ($stage, $wav, $trials).
. ./cmd.sh  # job-runner settings (configured to run locally)
. ./path.sh # required PATH settings
set -e # exit on error
mfccdir=/home/ZhouWJ/ASV/voxceleb/mfcc
vaddir=/home/ZhouWJ/ASV/voxceleb/mfcc

# Path parameters.
# NOTE: a '#' only starts a comment when it begins a word, so each comment is
# kept on its own line; gluing it to the value would make it part of the path.
# Directory where trials.txt is stored (the voxceleb top-level directory).
trials=/home/ZhouWJ/ASV/voxceleb
nnet_dir=/home/ZhouWJ/Code/voxceleb/exp/xvector_nnet_1a
# Root of the audio data; must contain the train, dev and test subfolders.
wav=/home/public/Database/VPR/华东理工的数据/zdh/ai2+vox2+1505/wav

stage=1 # selects which steps to run

1
2
3
4
5
6
7
8
9
10
11
12

查看shell脚本中设置的变量：无论是用export导出的变量还是直接赋值的变量，都可以用 echo $train_cmd 查看。区别在于：export导出的变量会传递给子进程，但仍只在当前窗口（会话）中有效；直接赋值的变量只在当前shell中有效，不会传递给子进程。
set -e 的作用是：任一命令返回非零状态时立即退出；它在脚本和交互式命令行中都可以使用
. ./cmd.sh 中的内容是export train_cmd="run.pl"

# stage1 调整格式

# Stage 1: normalize the data layout before any Kaldi table is generated.
# Reads $stage, $wav and $trials, which are set earlier in the script.
if [ "$stage" -le 1 ]; then
  # Normalize the utterance IDs and generate the trials file.
  # data_rename.py is only needed when the audio ID is not prefixed by the speaker ID.
  python3 tools/data_rename.py "$wav"
  python3 tools/create_trials.py "$wav" "$trials"
fi

1
2
3
4
5

# stage2 生成必要的表格

# Stage 2: generate the required Kaldi tables via local/data_prep.sh.
# Reads $stage and $wav, which are set earlier in the script.
. ./path.sh || exit 1 # abort when the environment setup fails (added by the note's author)

if [ "$stage" -le 2 ]; then
  local/data_prep.sh "$wav"
  echo "Stage-2 finish"
fi

1
2
3
4
5
6

对上述操作按以下命令单步执行

# local/data_prep.sh

aishell_data_prep.sh的用法：

Usage: aishell_data_prep.sh <audio-path> <text-path>
aishell_data_prep.sh /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript

1
2

参照aishell文件夹中的aishell_data_prep.sh

#. ./path.sh || exit 1;# 左边执行失败，则执行右边

# Build the wav.scp, utt2spk and spk2utt tables for the train, dev and test
# splits found under one audio root, then copy them into the data directory.
#
# Usage: data_prep.sh <audio-path>
#   <audio-path>  directory searched recursively for *.wav files; a file belongs
#                 to a split when its path contains wav/train, wav/dev or wav/test.
# Exits 1 on a usage error, when a split matches no audio file, or when a copy fails.
if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <audio-path>"
  echo " $0 /export/a05/xna/data/data_aishell/wav "
  exit 1
fi

audio_dir=$1

data_root=/home/ZhouWJ/ASV/voxceleb/data
train_dir=$data_root/local/train
dev_dir=$data_root/local/dev
test_dir=$data_root/local/test

mkdir -p "$train_dir" "$dev_dir" "$test_dir"

# Find the wav audio files for train, dev and test respectively.
# The whole tree is searched and then filtered by path (rather than passing each
# split directory separately), possibly to keep the number of script arguments down.
# grep exits non-zero when nothing matches, so an empty split aborts the script.
find "$audio_dir" -iname "*.wav" | grep -i "wav/train" > "$train_dir/wav.flist" || exit 1
find "$audio_dir" -iname "*.wav" | grep -i "wav/dev" > "$dev_dir/wav.flist" || exit 1
find "$audio_dir" -iname "*.wav" | grep -i "wav/test" > "$test_dir/wav.flist" || exit 1

# Count the audio files; a mismatch only produces a warning.
n=$(cat "$train_dir/wav.flist" "$dev_dir/wav.flist" "$test_dir/wav.flist" | wc -l)
if [ "$n" -ne 141925 ]; then
  echo Warning: expected 141925 data data files, found $n
fi

# Table preparation (the transcript steps of the aishell recipe stay commented out).
for dir in "$train_dir" "$test_dir" "$dev_dir"; do
  echo Preparing $dir transcriptions
  # Utterance ID = file name without the .wav suffix.
  sed -e 's/\.wav//' "$dir/wav.flist" | awk -F '/' '{printf("%s\n", $NF)}' > "$dir/utt.list"
  # Second column = name of the parent directory, used as the speaker ID.
  sed -e 's/\.wav//' "$dir/wav.flist" | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > "$dir/utt2spk_all"
  paste -d' ' "$dir/utt.list" "$dir/wav.flist" > "$dir/wav.scp_all"
  #utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text_dir/*.txt > $dir/transcripts.txt
  #awk '{print $1}' $dir/transcripts.txt | sort -u > $dir/utt.list
  # Filter against utt.list (presumably drops entries whose ID is not listed;
  # see utils/filter_scp.pl) and de-duplicate the result with sort -u.
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$dir/utt2spk_all" | sort -u > "$dir/utt2spk"
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$dir/wav.scp_all" | sort -u > "$dir/wav.scp"
  #sort -u $dir/transcripts.txt > $dir/text
  utils/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt"
done

# Copy the finished tables into the final data directories.
mkdir -p "$data_root/train" "$data_root/test" "$data_root/dev"
for f in spk2utt utt2spk wav.scp; do
  cp "$train_dir/$f" "$data_root/train/$f" || exit 1
  cp "$test_dir/$f" "$data_root/test/$f" || exit 1
  # The dev tables must come from $dev_dir; copying from $test_dir would make
  # the dev set a silent duplicate of the test set.
  cp "$dev_dir/$f" "$data_root/dev/$f" || exit 1
done

echo "$0: Data preparation succeeded!"
exit 0

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

# utils/filter_scp.pl

参考资料：utils/filter_scp.pl脚本功能分析 (opens new window)

#Kaldi

上次更新: 2024/04/10, 22:12:29

← iVector相关代码详解 Kaldi相关项目及碎片知识→