This crate provides a port of the original BERT `create_pretraining_data.py` script from the Google BERT repository.
Example usage (bash):
# Convert every *.txt corpus file under DATA_DIR into a TFRecord file in
# OUTPUT_DIR, running up to NUM_PROC conversions in parallel.
# Required env vars: DATA_DIR, OUTPUT_DIR, VOCAB_DIR, TARGET_DIR, NUM_PROC.
#
# NOTE(review): find recurses into subdirectories, but the command rebuilds
# the path as "${DATA_DIR}/%" from the bare basename — files living in a
# subdirectory of DATA_DIR would resolve to a wrong path. Confirm the corpus
# is flat, or add -maxdepth 1.
#
# '-exec basename' replaces the original first-stage 'xargs -I% -n 1 basename %'
# (same output, no per-file xargs slot); '-n 1' is dropped from the second
# stage because GNU xargs ignores -n when -I is given and warns about the
# conflict; "${NUM_PROC}" is quoted (SC2086).
find "${DATA_DIR}" -name "*.txt" -exec basename {} \; \
  | xargs -I% -P "${NUM_PROC}" \
      "${TARGET_DIR}/bert2d_create_pretraining" \
        --input-file="${DATA_DIR}/%" \
        --output-file="${OUTPUT_DIR}/%.tfrecord" \
        --vocab-file="${VOCAB_DIR}/vocab.txt" \
        --max-seq-length=512 \
        --max-predictions-per-seq=75 \
        --masked-lm-prob=0.15 \
        --random-seed=12345 \
        --dupe-factor=5
Licensed under the MIT license. See the LICENSE file for the full license text.