# -*- coding: utf-8 -*-

###############################################################################
## Create a 10-fold cross-validation sample                                   #
## Essentially it is exactly the same code as in William Turkel's Naive       #
## ... Bayesian posts, with only a few minor changes for consistency with     # 
## ... the other code snippets.                                               #
## Note: this code can be re-run if the code in count-offense-instances shows #
## ... a very uneven distribution of the offense category across samples.     #
###############################################################################
import os, random

# Preliminaries: the directory that receives the sample files, and the file
# ... that lists the trial ids (one id per line)
outdirname = '../baileyfiles/Samples_1830s/'
# makedirs (rather than mkdir) also creates any missing parent directory, so
# ... the script does not abort when ../baileyfiles/ is not there yet
if not os.path.exists(outdirname):
    os.makedirs(outdirname)
idfile = '../baileyfiles/trialids.txt'

# Get a list of trials: one id per line, surrounding whitespace removed.
# Blank lines (e.g. an empty line at the end of the file) are skipped so that
# ... no empty trial id ends up in a sample
with open(idfile, 'r') as f0:
    triallist = [line.strip() for line in f0 if line.strip()]

# Shuffle it, changing list in place
random.shuffle(triallist)

# Do floor division to get basic sample size and remainder
# `samplesize` is the number of trials every sample gets, `base` the number of
# ... trials covered by the ten equal samples, `remainder` what is left over
numtrials = len(triallist)
samplesize, remainder = divmod(numtrials, 10)
base = samplesize * 10
# A single parenthesised argument prints the same text whether this runs as a
# ... Python 2 print statement or as a Python 3 print() call
print("Trials: %d; Base sample: %d; Remainder: %d"
      % (numtrials, samplesize, remainder))

# Get basic samples
# Build a dictionary keyed by sample id (0-9); each value is the consecutive
# ... slice of `samplesize` trials that belongs to that sample
sample = dict(
    (fold, triallist[fold * samplesize:(fold + 1) * samplesize])
    for fold in range(10)
)

# Distribute remainder as equally as possible: the trials left over after the
# ... ten equal slices go one each to samples 0, 1, 2, ...
for fold, position in enumerate(range(base, base + remainder)):
    sample[fold].append(triallist[position])

# Write samples to files
# One file per sample, named sample<id>.txt inside the output directory, with
# ... one trial id per line
for fold_id, trials in sample.items():
    outfilename = outdirname + 'sample' + str(fold_id) + '.txt'
    with open(outfilename, 'w') as f1:
        f1.writelines(str(tr + '\n') for tr in trials)


