tensor_predictors/dataAnalysis/chess/Rchess/src/data_gen.cpp

138 lines
4.4 KiB
C++

#include <iostream>
#include <iomanip>
#include <string>
#include <fstream>
#include <sstream>
#include <limits>
#include <Rcpp.h>
#include "SchachHoernchen/Board.h"
//' Specialized version of `read_cyclic.cpp` tailored to work in conjunction with
//' `gmlm_chess()` as data generator to provide random draws from a FEN data set
//' with scores filtered to be in the range `score_min` to `score_max`.
//'
//' @param file path to the FEN data set file; every non-empty line is expected
//'  to have the form `<FEN>;<score>`.
//' @param sample_size number of positions to draw (must be positive).
//' @param score_min,score_max only positions with
//'  `score_min <= score < score_max` are accepted.
//' @param quiet if `TRUE`, positions failing the board's `isQuiet()` check are
//'  rejected.
//' @param min_ply_count positions with a ply count below this value are rejected.
//' @param white_only if `TRUE`, positions with black to move are rejected.
//'
//' @return character vector of `sample_size` FEN strings carrying the
//'  corresponding scores as numeric attribute `"scores"`.
//'
// [[Rcpp::export(name = "data.gen", rng = true)]]
Rcpp::CharacterVector data_gen(
    const std::string& file,
    const int sample_size,
    const float score_min = -5.0,
    const float score_max = +5.0,
    const bool quiet = false,
    const int min_ply_count = 10,
    const bool white_only = true
) {
    // Number of lines skipped after every accepted sample
    const int skip_lines = 256;
    // Total number of empty lines tolerated over the whole call
    const int max_empty_lines = 10;

    // Check parameters
    if (sample_size < 1) {
        Rcpp::stop("`sample_size` must be positive");
    }
    if (score_min >= score_max) {
        Rcpp::stop("`score_min` must be strictly smaller than `score_max`");
    }

    // Open FEN data set file
    std::ifstream input(file);
    if (!input) {
        Rcpp::stop("Opening file '%s' failed", file);
    }

    // Jump back to the first line whenever the end of the file has been hit
    // (the data set is read cyclically)
    auto rewind_on_eof = [&input]() {
        if (input.eof()) {
            input.clear();
            input.seekg(0);
        }
    };

    // Determine the file size; `std::streamoff` is wide enough for any file
    // offset (an `unsigned long` is only 32 bit on some platforms)
    input.seekg(0, std::ios::end);
    const std::streamoff file_size = static_cast<std::streamoff>(input.tellg());
    if (file_size < 0) {
        Rcpp::stop("Determining size of file '%s' failed", file);
    }

    // Set the read position to a random byte offset (exactly one RNG draw) ...
    const std::streamoff seek = static_cast<std::streamoff>(
        unif_rand() * static_cast<double>(file_size));
    input.seekg(seek);
    // ... and advance from there to the start of the next line, wrapping
    // around to the first line if the offset was inside the last line
    input.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
    rewind_on_eof();

    // Allocate output sample
    Rcpp::CharacterVector _sample(sample_size);
    Rcpp::NumericVector _scores(sample_size);

    // Rejection limit in 64 bit, `1000 * sample_size` overflows an `int` for
    // large sample sizes
    const long long max_rejects = 1000LL * sample_size;
    long long reject_count = 0;
    int sample_count = 0, retry_count = 0;

    // Read and filter lines from FEN data base file
    std::string line, fen;
    Board pos;
    while (sample_count < sample_size) {
        // Allow the user to interrupt from `R`. The Rcpp variant signals the
        // interrupt via an exception, so the stream and strings are destroyed
        // properly (the plain C API call would long-jump past destructors)
        Rcpp::checkUserInterrupt();

        // Avoid infinite loop
        if (reject_count > max_rejects) {
            Rcpp::stop("Too many rejections, stop to avoid infinite loop");
        }

        // Read line, in case of failure retry from start of file (recycling)
        if (!std::getline(input, line)) {
            input.clear();
            input.seekg(0);
            if (!std::getline(input, line)) {
                // another failure is fatal
                Rcpp::stop("Recycline lines in file '%s' failed", file);
            }
        }

        // Empty lines are tolerated a few times (counted over the whole call)
        if (line.empty()) {
            if (++retry_count > max_empty_lines) {
                Rcpp::stop("Retry count exceeded after reading empty line in '%s'", file);
            }
            continue;
        }

        // Split candidate line `<FEN>;<score>` into FEN and score
        std::istringstream candidate(line);
        std::getline(candidate, fen, ';');
        float score = 0.0f;
        candidate >> score;
        if (candidate.fail()) {
            // If this fails, the FEN data base is ill formed!
            Rcpp::stop("Ill formated FEN data base file '%s'", file);
        }

        // Parse FEN, needed for the position based filters below
        bool parseError = false;
        pos.init(fen, parseError);
        if (parseError) {
            Rcpp::stop("Illegal FEN '%s' in FEN data base file '%s'", fen, file);
        }

        // Reject / Filter samples (short-circuit order kept: `isQuiet()` is
        // only evaluated if no earlier condition already rejects)
        if ((static_cast<int>(pos.plyCount()) < min_ply_count)      // early positions
            || (white_only && (pos.sideToMove() == piece::black))   // black to move
            || (score < score_min || score_max <= score)            // scores out of slice
            || (quiet && !pos.isQuiet()))                           // non-quiet positions
        {
            ++reject_count;
            continue;
        }

        // Everything succeeded and we got an appropriate sample in requested range
        _sample[sample_count] = fen;
        _scores[sample_count] = score;
        ++sample_count;

        // Skip lines (ensures independent draws based on games being independent)
        rewind_on_eof();
        for (int s = 0; s < skip_lines; ++s) {
            input.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
            rewind_on_eof();
        }
    }

    // Set scores as attribute to position sample
    _sample.attr("scores") = _scores;

    return _sample;
}