From 1a0b3671286277c680e9d2e537a5216f8c0760e7 Mon Sep 17 00:00:00 2001
From: Saida Yusupova <yusupova@rhrk.uni-kl.de>
Date: Thu, 15 Dec 2022 02:25:29 +0100
Subject: [PATCH] Add DVAE-to-ETM data format conversion script and SLURM run scripts

---
 dvae_to_etm.py   | 124 +++++++++++++++++++++++++++++++++++++++++++++++
 run_dvae_20ng.sh |  22 +++++++++
 run_dvae_wiki.sh |  22 +++++++++
 run_etm_20ng.sh  |  26 ++++++++++
 4 files changed, 194 insertions(+)
 create mode 100644 dvae_to_etm.py
 create mode 100644 run_dvae_20ng.sh
 create mode 100644 run_dvae_wiki.sh
 create mode 100644 run_etm_20ng.sh

diff --git a/dvae_to_etm.py b/dvae_to_etm.py
new file mode 100644
index 0000000..e2def4c
--- /dev/null
+++ b/dvae_to_etm.py
@@ -0,0 +1,124 @@
+import os
+import json
+import pickle
+import numpy as np
+from scipy import sparse
+from pathlib import Path
+from scipy.io import savemat
+
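+# Convert the sparse document-term matrices produced by the DVAE preprocessing
+# into the format used by the ETM code in this repo: a pickled vocabulary list
+# (vocab.pkl) plus bow_*_tokens.mat / bow_*_counts.mat files holding, for each
+# document, its vocabulary indices and the corresponding term counts.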
+infile_path = './data/wikitext/processed_wiki/'
+outfile_path = './data/wikitext/processed_wiki/etm-format/'
+wiki_train_dtm = sparse.load_npz(Path(infile_path, "train.dtm.npz"))
+wiki_val_dtm = sparse.load_npz(Path(infile_path, "val.dtm.npz"))
+wiki_ts_dtm = sparse.load_npz(Path(infile_path, "ts.dtm.npz"))
+
+wiki_full_dtm = sparse.load_npz(Path(infile_path, "full.dtm.npz"))
+wiki_full_filtered_dtm = sparse.load_npz(Path(infile_path, "full_filtered.dtm.npz"))
+wiki_tr_va_filtered_dtm = sparse.load_npz(Path(infile_path, "full_train_val_filtered.dtm.npz"))
+
+def create_val_dtm():
+    # prepare val.dtm.npz: sample validation documents from the filtered full set
+    wiki_val_size = 4200
+    # sample without replacement so no document appears twice in the validation set
+    rand_ind = np.random.choice(wiki_full_filtered_dtm.shape[0], wiki_val_size, replace=False)
+    wiki_val = wiki_full_filtered_dtm[rand_ind, :]
+    # save val set
+    print("saving val set...")
+    sparse.save_npz(Path(infile_path, "val.dtm.npz"), wiki_val)
+
+def create_ts_dtm():
+    # prepare ts.dtm.npz: sample test documents from the train/val-filtered set
+    wiki_ts_size = 4200
+    # sample without replacement so no document appears twice in the test set
+    rand_ind = np.random.choice(wiki_tr_va_filtered_dtm.shape[0], wiki_ts_size, replace=False)
+    wiki_ts = wiki_tr_va_filtered_dtm[rand_ind, :]
+    # save test set
+    print("saving test set...")
+    sparse.save_npz(Path(infile_path, "ts.dtm.npz"), wiki_ts)
+
+def json_to_pkl():
+    # Convert the vocab.json mapping into a pickled list of vocabulary words
+    print('converting vocab.json to vocab.pkl...')
+    # read json file
+    with open(infile_path + 'vocab.json', 'r') as infile:
+        json_obj = json.load(infile)
+    # the ETM format only needs the vocabulary words, i.e. the dict keys
+    vocab = list(json_obj.keys())
+
+    # write the vocabulary into a pkl file
+    os.makedirs(outfile_path, exist_ok=True)
+    with open(outfile_path + 'vocab.pkl', 'wb') as outfile:
+        pickle.dump(vocab, outfile)
+
+    return vocab
+
+# one-off helpers to create the val/ts dtm files (left commented out once the files exist)
+#create_val_dtm()
+#create_ts_dtm()
+
+# print(wiki_full_filtered_dtm.shape)
+# print(wiki_train_dtm.shape)
+vocab = json_to_pkl()
+n_docs_tr = wiki_train_dtm.shape[0]
+n_docs_val = wiki_val_dtm.shape[0]
+n_docs_ts = wiki_ts_dtm.shape[0]
+n_docs_full = wiki_full_dtm.shape[0]
+
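+# The test set is split into two halves; ETM-style evaluation typically infers
+# topic proportions from the first half and computes held-out perplexity on the
+# second (document completion).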
+# Split test set into 2 halves
+print('splitting test documents into 2 halves...')
+ts_h1 = np.arange(0, n_docs_ts // 2, dtype=int)
+ts_h2 = np.arange(n_docs_ts // 2, n_docs_ts, dtype=int)
+bow_ts_h1 = wiki_ts_dtm[ts_h1]
+bow_ts_h2 = wiki_ts_dtm[ts_h2]
+
+n_docs_ts_h1 = bow_ts_h1.shape[0]
+n_docs_ts_h2 = bow_ts_h2.shape[0]
+
+# split bow into token/count pairs
+print('splitting bow into token/count pairs and saving to disk...')
+def split_bow(bow_in, n_docs):
+    # for every document, collect its vocabulary indices and their counts
+    indices = [bow_in[doc, :].indices.tolist() for doc in range(n_docs)]
+    counts = [bow_in[doc, :].data.tolist() for doc in range(n_docs)]
+    return indices, counts
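+
+# Each pair of .mat files below stores, per document, the vocabulary indices
+# ('tokens') and the matching term frequencies ('counts').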
+
+# split train
+bow_tr_tokens, bow_tr_counts = split_bow(wiki_train_dtm, n_docs_tr)
+savemat(outfile_path + 'bow_tr_tokens.mat', {'tokens': bow_tr_tokens}, do_compression=True)
+savemat(outfile_path + 'bow_tr_counts.mat', {'counts': bow_tr_counts}, do_compression=True)
+#del vocab
+del bow_tr_tokens
+del bow_tr_counts
+del n_docs_tr
+
+# split val
+bow_val_tokens, bow_val_counts = split_bow(wiki_val_dtm, n_docs_val)
+savemat(outfile_path + 'bow_va_tokens.mat', {'tokens': bow_val_tokens}, do_compression=True)
+savemat(outfile_path + 'bow_va_counts.mat', {'counts': bow_val_counts}, do_compression=True)
+del bow_val_tokens
+del bow_val_counts
+del n_docs_val
+
+# split test
+bow_ts_tokens, bow_ts_counts = split_bow(wiki_ts_dtm, n_docs_ts)
+savemat(outfile_path + 'bow_ts_tokens.mat', {'tokens': bow_ts_tokens}, do_compression=True)
+savemat(outfile_path + 'bow_ts_counts.mat', {'counts': bow_ts_counts}, do_compression=True)
+del bow_ts_tokens
+del bow_ts_counts
+del n_docs_ts
+
+bow_ts_h1_tokens, bow_ts_h1_counts = split_bow(bow_ts_h1, n_docs_ts_h1)
+savemat(outfile_path + 'bow_ts_h1_tokens.mat', {'tokens': bow_ts_h1_tokens}, do_compression=True)
+savemat(outfile_path + 'bow_ts_h1_counts.mat', {'counts': bow_ts_h1_counts}, do_compression=True)
+del bow_ts_h1
+del bow_ts_h1_tokens
+del bow_ts_h1_counts
+
+bow_ts_h2_tokens, bow_ts_h2_counts = split_bow(bow_ts_h2, n_docs_ts_h2)
+savemat(outfile_path + 'bow_ts_h2_tokens.mat', {'tokens': bow_ts_h2_tokens}, do_compression=True)
+savemat(outfile_path + 'bow_ts_h2_counts.mat', {'counts': bow_ts_h2_counts}, do_compression=True)
+del bow_ts_h2
+del bow_ts_h2_tokens
+del bow_ts_h2_counts
+
+print('Data ready !!')
+print('*************')
\ No newline at end of file
diff --git a/run_dvae_20ng.sh b/run_dvae_20ng.sh
new file mode 100644
index 0000000..8afd4fe
--- /dev/null
+++ b/run_dvae_20ng.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+#SBATCH --nodes=1
+#SBATCH --time=24:00:00
+#SBATCH --partition=informatik-mind
+#SBATCH --ntasks=5
+#SBATCH --job-name=master-project
+#SBATCH --gres=gpu:V100:1
+#SBATCH --mem=20G
+#SBATCH -o /scratch/yusupova/server_dump/output.%j.%N.log
+#SBATCH -e /scratch/yusupova/server_dump/error.%j.%N.log
+
+module load anaconda3/latest
+. $ANACONDA_HOME/etc/profile.d/conda.sh
+
+conda activate dvae
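+# Train a 50-topic DVAE on the preprocessed 20 Newsgroups data (min_df=10),
+# using train.dtm.npz as the evaluation file.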
+python3 soup_nuts/models/dvae/main.py \
+    --input_dir soup_nuts/models/etm/scripts/min_df_10_dvae_format \
+    --output_dir results/dvae-20ng \
+    --eval_path train.dtm.npz \
+    --num_topics 50
+
+conda deactivate
diff --git a/run_dvae_wiki.sh b/run_dvae_wiki.sh
new file mode 100644
index 0000000..d018dde
--- /dev/null
+++ b/run_dvae_wiki.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+#SBATCH --nodes=1
+#SBATCH --time=24:00:00
+#SBATCH --partition=informatik-mind
+#SBATCH --ntasks=5
+#SBATCH --job-name=master-project
+#SBATCH --gres=gpu:V100:1
+#SBATCH --mem=20G
+#SBATCH -o /scratch/yusupova/server_dump/output.%j.%N.log
+#SBATCH -e /scratch/yusupova/server_dump/error.%j.%N.log
+
+module load anaconda3/latest
+. $ANACONDA_HOME/etc/profile.d/conda.sh
+
+conda activate dvae
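+# Train a 50-topic DVAE on the preprocessed WikiText data,
+# using train.dtm.npz as the evaluation file.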
+python3 soup_nuts/models/dvae/main.py \
+    --input_dir data/wikitext/processed_wiki \
+    --output_dir results/dvae-wiki \
+    --eval_path train.dtm.npz \
+    --num_topics 50
+
+conda deactivate
diff --git a/run_etm_20ng.sh b/run_etm_20ng.sh
new file mode 100644
index 0000000..f12fb59
--- /dev/null
+++ b/run_etm_20ng.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --nodes=1
+#SBATCH --time=96:00:00
+#SBATCH --partition=informatik-mind
+#SBATCH --ntasks=5
+#SBATCH --job-name=master-project
+#SBATCH --gres=gpu:V100:1
+#SBATCH --mem=20G
+#SBATCH -o /scratch/yusupova/server_dump/output.%j.%N.log
+#SBATCH -e /scratch/yusupova/server_dump/error.%j.%N.log
+
+module load anaconda3/latest
+. $ANACONDA_HOME/etc/profile.d/conda.sh
+
+conda activate etm
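+# Train a 50-topic ETM on the 20 Newsgroups data for 1000 epochs, learning the
+# word embeddings during training (--train_embeddings 1) and reporting topic
+# coherence (--tc 1).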
+python3 soup_nuts/models/etm/main.py \
+    --mode train \
+    --data_path soup_nuts/models/etm/data/20ng \
+    --output_dir results/etm-20ng \
+    --num_topics 50 \
+    --train_embeddings 1 \
+    --epochs 1000 \
+    --tc 1
+
+conda deactivate
+
-- 
GitLab