tensor_predictors/dataAnalysis/chess/preprocessing.sh

32 lines
1.3 KiB
Bash
Executable File

#!/bin/bash
# Convert the Lichess data base of standard rated games (November 2023) from
# compressed PGN into a list of FEN strings with position evaluations.
#
# Produces `${data}.fen` in the current directory. If that file already
# exists nothing is done. If `${data}.pgn.zst` is present on disk it is used
# as input, otherwise the archive is streamed from database.lichess.org.
#
# Requires: make, zstdcat, wget (only if the archive is not on disk).

# Abort on the first failing command, on use of an unset variable, and when
# ANY stage of a pipeline fails (not only the last one). Without this a
# broken download or decompression would go unnoticed.
set -euo pipefail

# Data set name: Chess games from the Lichess Data Base for standard rated games
# in November 2023
data=lichess_db_standard_rated_2023-11

# Check if the result file exists and only do the work if it does not
if [ -f "${data}.fen" ]; then
    echo "File '${data}.fen' already exists, assuming job already done."
    echo "To rerun delete (rename) the files '${data}.pgn.zst' and/or '${data}.fen'"
else
    # First, compile `pgn2fen`
    make pgn2fen

    # Write into a partial file and only rename it to the final name after
    # the whole pipeline succeeded. Otherwise an interrupted or failed run
    # would leave a truncated `${data}.fen` behind which the existence check
    # above would mistake for a finished job on the next invocation.
    partial="${data}.fen.part"
    # Remove the partial file on any exit path; after the successful rename
    # below the file no longer exists and `rm -f` is a no-op.
    trap 'rm -f -- "${partial}"' EXIT

    # Download the PGN data base via `wget` if not found on disk.
    # The flag `-q` suppresses `wget`s own output and `-O-` tells `wget` to
    # stream the downloaded file to `stdout`.
    # Otherwise, use the file on disk directly.
    # Decompress the stream with `zstdcat` (no temporary files)
    # The uncompressed PGN data is then piped into `pgn2fen` which converts
    # the PGN data base into a list of FEN strings while filtering only
    # positions with evaluation. The `--scored` parameter specifies to extract
    # a position evaluation from the PGN and ONLY write positions with scores.
    # That is, positions without a score are removed!
    if [ -f "${data}.pgn.zst" ]; then
        zstdcat "${data}.pgn.zst" | ./pgn2fen --scored > "${partial}"
    else
        wget -qO- "https://database.lichess.org/standard/${data}.pgn.zst" \
            | zstdcat | ./pgn2fen --scored > "${partial}"
    fi

    # Publish the result under its final name (same directory, so a rename).
    mv -- "${partial}" "${data}.fen"
fi