diff --git a/main.py b/main.py
index ce3484ec53085879eb9b6cda7f11b8b6ca928854..6c5311a12680202d2ffbc9412539799b206153fe 100644
--- a/main.py
+++ b/main.py
@@ -40,12 +40,13 @@ def train_one_epoch(danceformer, train_loader, epoch, model_params_path, device,
     num_iters = len(train_loader)
     initial_start_time = time.time()
     empty_token = danceformer.indexed_vocabulary.get("0000")
-    greater_ratio_values_bool = False
+    #greater_ratio_values_bool = False
 
     # Run training
     # for each song, get 3 spectrograms and process them through cnn, mlp (difficulty) and transformer
     for i, data in enumerate(train_loader):
         #elapsed_second_step = time.time() - epoch_start_time
+        greater_ratio_values_bool = False
         now = datetime.now()
         iter_num = i % num_iters
         elapsed_time = time.time() - initial_start_time
@@ -97,7 +98,9 @@ def train_one_epoch(danceformer, train_loader, epoch, model_params_path, device,
         #predictions = torch.argmax(output, dim=1)
         #predictions = max_values.indices
         output_flat = output.view(-1, danceformer.ntokens).to(device)
-        charts_flat = charts.view(-1).to(device)
+        charts = charts[:, 1:]
+        #charts_flat = charts.view(-1).to(device)
+        charts_flat = charts.reshape(charts.shape[0] * charts.shape[1])
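+        # targets are the start-token-shifted charts[:, 1:]; the decoder consumes charts[:, :-1]
+        # (see models/transformer.py), so predictions and targets line up for next-token training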
         # check how probability per token develops
         #outputs.append(output_flat)
         #outputs_softmax.append(torch.nn.Softmax(output_flat))
@@ -162,9 +165,11 @@ def train_one_epoch(danceformer, train_loader, epoch, model_params_path, device,
         empty_check = torch.where(predictions != empty_token, 1.0, 0.0)
         #print(empty_check)
         #print(f"Number of non empty chart entries: {len(torch.nonzero(empty_check))}")
+        # charts[:, 1:]
         loss = danceformer.criterion(output_flat, charts_flat)
         # leave out empty tokens, just for evaluation, not for backpropagation
-        loss_v_2 = danceformer.criterion(output_wo_empty, charts_wo_empty)
+        with torch.no_grad():
+            loss_v_2 = danceformer.criterion(output_wo_empty, charts_wo_empty)
 
 
 
@@ -384,22 +389,28 @@ def custom_collate_fn(batch, max_len_specs, vocabulary, folder, specs_folder, de
         # charts: need to take appropriate window of chart
         # have intervals of 0.125 sec (range_dist) with criteria time_step > timed_note.time >= time_step - 0.125
         # interval ranges -> example [0,0.125)
+        # charts now begin with a start token, so every chart index is shifted by +1
         range_dist = 1 / 64  # 1/8
 
         # take sublist of start to max_len_specs, need to transform start to window of chart
         # chart length too small, possible??
 
         # if chart not too short, take sublist of notes
-        start_time = int(start / range_dist)
+        # start (in seconds) / range_dist is the same as * 64 and gives the interval index; +1 accounts for the start token shift
+        start_time = int(start / range_dist)+1
         #print(f"Start time: {start_time}")
         range_time = max_len_specs / range_dist
         range_end = int(start_time + range_time)
         #print(f"Range end: {range_end}")
         # end out of range, because chart already ended...
         if range_end > len(song_chart):
+            # temporarily remove the end-of-chart token so the padding goes before it
+            end_token_tmp = song_chart[-1]
+            song_chart = song_chart[:-1]
             # add empty chart token
-            while range_end != len(song_chart):
+            while range_end != len(song_chart)-1:
                 song_chart.append(indexed_vocabulary.get("0000"))
+            song_chart.append(end_token_tmp)
         #print(f"Length new chart: {len(song_chart[start_time:range_end])}")
         # save and to tensor
         # rang end must be inside chart length
@@ -434,73 +445,86 @@ def custom_collate_fn(batch, max_len_specs, vocabulary, folder, specs_folder, de
 
 def setup_parser():
     out = argparse.ArgumentParser()
-    # cluster version
+    # # # cluster version
     out.add_argument('--get_vocabulary', default=None, type=str, help="Path to load vocabulary")
-    out.add_argument('--save_vocabulary', default=r'/scratch/grzonkow/2702/vocabulary.pkl',
+    out.add_argument('--save_vocabulary', default=r'/scratch/grzonkow/2902/vocabulary.pkl',
                      type=str, help="Path to safe vocabulary")
     out.add_argument('--dataset', default=None, type=str, help="Path to dataset")
     out.add_argument('--built_dataset', default=None, type=str, help="Path to dataset folder")
-    out.add_argument('--safe_dataset', default='/scratch/grzonkow/2702/train_dataset_with_mel_specs.pkl',
+    out.add_argument('--safe_dataset', default='/scratch/grzonkow/2902/train_dataset_2902.pkl',
                      type=str,
                      help="Path to safe dataset")
+    out.add_argument('--old_version',
+                     default=False,
+                     type=str, help="Indicates if old version without start end tokens need to be loaded")
     out.add_argument('--folder', default=r'/work/MLShare/StepMania/data/cleaned/allowed_meter_difference_of_2/',
                      type=str, help="Path to folder containing packages with music and sm files")
     out.add_argument('--model_params_path',
-                     default=r'/scratch/grzonkow/2602/models/model_params_2702_100_batch_size',
+                     default=r'/scratch/grzonkow/2902/models/model_params_2702_100_batch_size',
                      type=str, help="Path to safe model parameters")
     out.add_argument('--load_model_params_path', default=None, type=str, help="Path to load model parameters")
-    out.add_argument('--safe_dir', default="/scratch/grzonkow/2702/output",
+    out.add_argument('--safe_dir', default="/scratch/grzonkow/2902/output",
                      type=str, help="Path to safe dataset points")
-    out.add_argument('--specs_folder', default="/scratch/grzonkow/2702/dataset/",
+    out.add_argument('--specs_folder', default="/scratch/grzonkow/2902/dataset/",
                      type=str, help="Path to safe dataset points")
-    out.add_argument('--problem_dir', default="/scratch/grzonkow/2702/details/",
+    out.add_argument('--problem_dir', default="/scratch/grzonkow/2902/details/",
                      type=str, help="Path to safe problems")
-    out.add_argument('--model_params_path_nan_problem', default="/scratch/grzonkow/2702/nan_problem/",
+    out.add_argument('--model_params_path_nan_problem', default="/scratch/grzonkow/2902/nan_problem/",
                      type=str, help="Path to safe problems")
-    #
-    # #
+
+
+
     #local version
-    # out.add_argument('--get_vocabulary', default=None,
-    #                  type=str, help="Path to load vocabulary")
-    # out.add_argument('--save_vocabulary', default=r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/vocabulary.pkl',
+    # #out.add_argument('--get_vocabulary', default=None,
+    # #                type=str, help="Path to load vocabulary")
+    # out.add_argument('--get_vocabulary',
+    #                  default=r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/vocabulary.pkl',
+    #                   type=str, help="Path to load vocabulary")
+    # out.add_argument('--save_vocabulary', default=r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/vocabulary.pkl',
     #                  type=str, help="Path to safe vocabulary")
-    # out.add_argument('--dataset', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/train_dataset_latest.pkl",
-    #                 type=str, help="Path to dataset")
-    # #out.add_argument('--dataset', default=None, type=str, help="Path to dataset")
+    # #out.add_argument('--dataset', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/train_dataset_latest.pkl",
+    # #                type=str, help="Path to dataset")
+    # out.add_argument('--dataset', default='C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/train_dataset_latest.pkl', type=str, help="Path to dataset")
     # #out.add_argument('--built_dataset', default=None, type=str, help="Path to dataset folder")
     # out.add_argument('--built_dataset', default=None, type=str, help="Path to dataset folder")
     # out.add_argument('--safe_dataset',
-    #                  default='C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/train_dataset_latest.pkl',
+    #                  default='C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/train_dataset_latest.pkl',
     #                  type=str, help="Path to safe dataset")
     # out.add_argument('--folder', default=r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_pack/',
     #                  type=str, help="Path to folder containing packages with music and sm files")
     # out.add_argument('--model_params_path',
-    #                  default=r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/model_params',
+    #                  default=r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/model_params',
     #                  type=str, help="Path to safe model parameters")
     # out.add_argument('--load_model_params_path', default=None, type=str, help="Path to load model parameters")
     # #out.add_argument('--load_model_params_path', default=r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/new model/model_params_2302.pt',
     # #                 type=str, help="Path to load model parameters")
     # #out.add_argument('--safe_dir', default=None, type=str, help="Path to dataset")
-    # out.add_argument('--safe_dir', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/output",
+    # out.add_argument('--safe_dir', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/output",
     #                  type=str, help="Path to safe dataset points")
-    # out.add_argument('--specs_folder', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/copy_dataset/",
+    # out.add_argument('--specs_folder', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/copy_dataset/",
     #                  type=str, help="Path to safe dataset points")
-    # out.add_argument('--problem_dir', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/details/",
+    # out.add_argument('--problem_dir', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/details/",
     #                   type=str, help="Path to safe dataset points")
-    # out.add_argument('--model_params_path_nan_problem', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/nan_problem/",
+    # out.add_argument('--model_params_path_nan_problem', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/nan_problem/",
     #                  type=str, help="Path to safe problems")
 
 
-    # local version
-    # out.add_argument('--get_vocabulary', default=r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/vocabulary.pkl',
+    # #local version
+    #out.add_argument('--get_vocabulary',
+    #                 default=None,
+    #                 type=str, help="Path to load vocabulary")
+    # out.add_argument('--get_vocabulary', default=r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/cluster/vocabulary.pkl',
     #                  type=str, help="Path to load vocabulary")
     # out.add_argument('--save_vocabulary', default=r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/vocabulary_new.pkl',
     #                  type=str, help="Path to safe vocabulary")
     # #out.add_argument('--dataset', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/train_dataset_latest.pkl",
     # #                type=str, help="Path to dataset")
-    # out.add_argument('--dataset', default=None, type=str, help="Path to dataset")
-    # #out.add_argument('--built_dataset', default=None, type=str, help="Path to dataset folder")
-    # out.add_argument('--built_dataset', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/output", type=str, help="Path to dataset folder")
+    # out.add_argument('--old_version',
+    #                  default=False,
+    #                  type=str, help="Indicates if old version without start end tokens need to be loaded")
+    # out.add_argument('--dataset', default='C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/train_dataset_latest.pkl', type=str, help="Path to dataset")
+    # out.add_argument('--built_dataset', default=None, type=str, help="Path to dataset folder")
+    # #out.add_argument('--built_dataset', default="C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/output", type=str, help="Path to dataset folder")
     # out.add_argument('--safe_dataset',
     #                  default='C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/train_dataset_latest.pkl',
     #                  type=str, help="Path to safe dataset")
@@ -534,7 +558,9 @@ if __name__ == '__main__':
     pkgs = os.listdir(folder)
     os.makedirs(args.model_params_path_nan_problem, exist_ok=True)
     os.makedirs(args.problem_dir, exist_ok=True)
-    os.makedirs("/scratch/grzonkow/2702/models/", exist_ok=True)
+    os.makedirs("/scratch/grzonkow/2902/models/", exist_ok=True)
+    #os.makedirs("C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/models/", exist_ok=True)
+    old_version = args.old_version
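+    # old_version triggers the one-time vocabulary and chart migration further below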
     #best_model_params_path = os.path.join(tempdir, "best_model_params.pt")
     #model_params_path = r'/scratch/grzonkow/model_params.pt'
     model_params_path = args.model_params_path
@@ -557,7 +583,17 @@ if __name__ == '__main__':
     # preprocessing
     if args.get_vocabulary is None:
         indexed_vocabulary = built_chart_vocabulary(songs)
-        # save vocabulary for reusing
+        if old_version:
+            # need to add start and end tokens
+            # get the last index value (equivalently len(indexed_vocabulary) - 1)
+            last_value = list(indexed_vocabulary.values())[-1]
+            start_token = "<s>"
+            end_token = "<e>"
+            indexed_vocabulary[start_token] = last_value+1
+            indexed_vocabulary[end_token] = last_value+2
+            print(f"With start and end token: \n{indexed_vocabulary}")
+            print(f"Length updated vocabulary: \n{len(indexed_vocabulary)}")
+        # save vocabulary for reusing
         with open(args.save_vocabulary, 'wb') as fp:
             pickle.dump(indexed_vocabulary, fp)
             print('dictionary saved successfully to file')
@@ -567,6 +603,20 @@ if __name__ == '__main__':
             indexed_vocabulary = pickle.load(fp)
             print('Loaded dictionary:')
             print(indexed_vocabulary)
+        # only needed once, when an old vocabulary without start/end tokens is loaded
+        if old_version:
+            # get the last index value (equivalently len(indexed_vocabulary) - 1)
+            last_value = list(indexed_vocabulary.values())[-1]
+            start_token = "<s>"
+            end_token = "<e>"
+            indexed_vocabulary[start_token] = last_value + 1
+            indexed_vocabulary[end_token] = last_value + 2
+            print(f"With start and end token: \n{indexed_vocabulary}")
+            print(f"Length updated vocabulary: \n{len(indexed_vocabulary)}")
+            # save vocabulary for reusing
+            with open(args.save_vocabulary, 'wb') as fp:
+                pickle.dump(indexed_vocabulary, fp)
+                print('dictionary saved successfully to file')
     print("--------------------------------------------------------------------------------- \n")
 
 
@@ -597,6 +647,35 @@ if __name__ == '__main__':
     start_time_first_step = time.time()
 
     if args.built_dataset:
+        # run only once: prepends the start token and appends the end token to every stored chart
+        # empty_token = indexed_vocabulary.get("0000")
+        # alternatively: use the values of the second-to-last and last vocabulary entries
+        if old_version:
+            start_token = indexed_vocabulary.get("<s>")
+            end_token = indexed_vocabulary.get("<e>")
+
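+            # one-time, in-place migration: running this block again would add duplicate start/end tokens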
+            path_to_charts = f'{args.built_dataset}/all_charts/'
+            for idx in range(0, len(os.listdir(path_to_charts))):
+                with open(f"{path_to_charts}{idx}.pkl", 'rb') as fp:
+                    new_song_chart = []
+                    new_song_chart.append(start_token)
+                    song_chart = pickle.load(fp)
+                    new_song_chart.extend(song_chart)
+                    # update with start and end token
+                    new_song_chart.append(end_token)
+                # save again
+                with open(f"{path_to_charts}{idx}.pkl", 'wb') as fp:
+                    pickle.dump(new_song_chart, fp)
+
+        # test one file
+        #with open(f"{path_to_charts}0.pkl", 'rb') as fp:
+        #    # with open(f'/scratch/grzonkow/vocabulary.pkl', 'rb') as fp:
+        #    song_chart = pickle.load(fp)
+        #    print(song_chart)
+        #    print(song_chart[-1])
+
+
         # in ms
         win_length_list_ms = [23, 46, 93]
         for win in win_length_list_ms:
diff --git a/model_run.sbatch b/model_run.sbatch
index 8b1ad13a375b395c4b786449e218b7e8658d0e9a..8e633ad8a696eceea53e6243fe4a28f7f9833a19 100644
--- a/model_run.sbatch
+++ b/model_run.sbatch
@@ -1,6 +1,6 @@
 #!/bin/bash
 #SBATCH -t 7-00:00:00                    # time limit set to 1 week, 1 day 1-00:00:00
-#SBATCH -J Model_2702_1                  # the job name
+#SBATCH -J Model_2902_1                  # the job name
 #SBATCH --mail-type=END,FAIL,TIME_LIMIT  # send notification emails
 #SBATCH -n 5                             # use 5 tasks
 #SBATCH --cpus-per-task=1                # use 1 thread per taks
@@ -9,8 +9,8 @@
 #SBATCH --partition=informatik-mind
 #SBATCH --gpus=1                         # request 1 GPU
 #SBATCH --gpu_cmode=shared               # Set the GPU into shared mode, so that multiple processes can run on it
-#SBATCH --output=/scratch/grzonkow/2702/model_100_batch_size.txt         # capture output
-#SBATCH --error=/scratch/grzonkow/2702/err_100_batch_size.txt          # and error streams
+#SBATCH --output=/scratch/grzonkow/2902/model_100_batch_size.txt         # capture output
+#SBATCH --error=/scratch/grzonkow/2902/err_100_batch_size.txt          # and error streams
 
 
 
@@ -22,7 +22,7 @@ unset LD_LIBRARY_PATH
 #pip install -r requirements.txt
 #pip install simfile
 #python -u main.py --get_vocabulary /scratch/grzonkow/vocabulary.pkl --dataset /scratch/grzonkow/train_dataset_latest.pkl --processes $SLURM_NTASKS --threads $SLURM_CPUS_PER_TASK "$@"
-python -u main.py --get_vocabulary /scratch/grzonkow/vocabulary.pkl --built_dataset /scratch/grzonkow/output --processes $SLURM_NTASKS --threads $SLURM_CPUS_PER_TASK "$@"
+python -u main.py --get_vocabulary /scratch/grzonkow/vocabulary.pkl --built_dataset /scratch/grzonkow/output --old_version True --processes $SLURM_NTASKS --threads $SLURM_CPUS_PER_TASK "$@"
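+# --old_version True adds the <s>/<e> tokens to the loaded vocabulary and rewrites the stored charts once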
 #python -u main.py --get_vocabulary /scratch/grzonkow/vocabulary.pkl --processes $SLURM_NTASKS --threads $SLURM_CPUS_PER_TASK "$@"
 #python -u main.py --processes $SLURM_NTASKS --threads $SLURM_CPUS_PER_TASK "$@"
 conda deactivate
diff --git a/models/transformer.py b/models/transformer.py
index 7d8bb718ed54f5b0b99d4793a17c2b3ce5006d35..8fc1f897a2d02715ce319b598d038114d4f22e76 100644
--- a/models/transformer.py
+++ b/models/transformer.py
@@ -174,7 +174,7 @@ class TransformerModel(nn.Module):
             """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
             Unmasked positions are filled with float(0.0).
             """
-            tgt_mask = nn.Transformer.generate_square_subsequent_mask(charts.shape[1]).to(device)
+            tgt_mask = nn.Transformer.generate_square_subsequent_mask(charts.shape[1]-1).to(device)
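+            # mask length matches the decoder input charts[:, :-1]; the last token is dropped for next-token prediction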
 
         # out = transformer_decoder(tgt, memory),
         # tgt (Tensor) – the sequence to the decoder (required).
@@ -191,7 +191,7 @@ class TransformerModel(nn.Module):
         #print(f"Tgt mask: {tgt_mask.shape}")
 
         # tgt, memory, tgt_mask=None
-        out = self.transformer_decoder(charts, comb, tgt_mask).to(device)
+        out = self.transformer_decoder(charts[:, :-1], comb, tgt_mask).to(device)
         #print(f"Out transformer decoder shape: {out.shape}")
         out = self.linear(out).to(device)
         #print(f"Output linear: {output}")
@@ -243,8 +243,9 @@ class Danceformer(nn.Module):
 
         # Read dictionary pkl file
         # args?
-        #with open(r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/new model/vocabulary.pkl', 'rb') as fp:
-        with open(r'/scratch/grzonkow/vocabulary.pkl', 'rb') as fp:
+        #with open(r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/2902_new_model_voc/vocabulary.pkl', 'rb') as fp:
+        #with open(r'C:/Users/cassi/OneDrive/Desktop/Master_Thesis/test_cluster/vocabulary_new.pkl', 'rb') as fp:
+        with open(r'/scratch/grzonkow/2902/vocabulary.pkl', 'rb') as fp:
             indexed_vocabulary = pickle.load(fp)
             #print('Loaded dictionary:')
             #print(indexed_vocabulary)
diff --git a/preprocess.py b/preprocess.py
index bb928fa60890085e3bb58fc1cfbea856336c17b4..7521a9baff568bd1bddab025426357cbfc38fdcf 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -125,6 +125,11 @@ def get_paths_diff_charts(safe_dir, song_dirs, indexed_vocabulary, device):
     num_files = 0
     # in ms
     win_length_list_ms = [23, 46, 93]
+    #empty_token = indexed_vocabulary.get("0000")
+    # alternatively: use the values of the second-to-last and last vocabulary entries
+    #start_token = indexed_vocabulary.get("<s>")
+    #end_token = indexed_vocabulary.get("<e>")
+
 
     for idx, song_dir in enumerate(song_dirs):
         # safe per song, then write to file, and new song
@@ -358,6 +363,11 @@ def process_chart(openedfile, test_file, chart, indexed_vocabulary, device):
     note_data = NoteData(chart)
     cols = note_data.columns
 
+    # empty_token = indexed_vocabulary.get("0000")
+    # alternatively: use the values of the second-to-last and last vocabulary entries
+    start_token = indexed_vocabulary.get("<s>")
+    end_token = indexed_vocabulary.get("<e>")
+
     # timing_data_test = TimingData(openedfile)
     # Converting song time to beats
     # engine = TimingEngine(timing_data_test)
@@ -423,6 +433,7 @@ def process_chart(openedfile, test_file, chart, indexed_vocabulary, device):
     # notes_in_range = [note_string] * len(np.arange(0, tmp[-1].key(), range_dist))
     # notes_in_range = dict()
     notes_in_range = []
+    notes_in_range.append(start_token)
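+    # every encoded chart begins with the start token; the matching end token is appended after the loop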
 
     # print(list(tmp.keys())[-1])
     # count_more_notes = 0
@@ -445,6 +456,9 @@ def process_chart(openedfile, test_file, chart, indexed_vocabulary, device):
             # notes_in_range[time_step] = indexed_vocabulary[res[0]]
             notes_in_range.append(indexed_vocabulary[res[0]])
 
+    # the end token is needed for the empty-token padding case when a random window is taken in the collate function
+    notes_in_range.append(end_token)
+
     # print(f"Number counted more notes in interval: {count_more_notes}")
     # if time_step > tmp.keys() >= time_step - range_dist:
     #         notes_per_range.append(timed_note.note)