From 1a0b3671286277c680e9d2e537a5216f8c0760e7 Mon Sep 17 00:00:00 2001
From: Saida Yusupova <yusupova@rhrk.uni-kl.de>
Date: Thu, 15 Dec 2022 02:25:29 +0100
Subject: [PATCH] dvae to etm format change

---
 dvae_to_etm.py   | 124 +++++++++++++++++++++++++++++++++++++++++++++++
 run_dvae_20ng.sh |  22 +++++++++
 run_dvae_wiki.sh |  22 +++++++++
 run_etm_20ng.sh  |  26 ++++++++++
 4 files changed, 194 insertions(+)
 create mode 100644 dvae_to_etm.py
 create mode 100644 run_dvae_20ng.sh
 create mode 100644 run_dvae_wiki.sh
 create mode 100644 run_etm_20ng.sh

diff --git a/dvae_to_etm.py b/dvae_to_etm.py
new file mode 100644
index 0000000..e2def4c
--- /dev/null
+++ b/dvae_to_etm.py
@@ -0,0 +1,124 @@
+import os
+import json
+import pickle
+import random
+import numpy as np
+from scipy import sparse
+from pathlib import Path
+from scipy.io import savemat, loadmat
+
+infile_path = './data/wikitext/processed_wiki/'
+outfile_path = './data/wikitext/processed_wiki/etm-format/'
+wiki_train_dtm = sparse.load_npz(Path(infile_path, "train.dtm.npz"))
+wiki_val_dtm = sparse.load_npz(Path(infile_path, "val.dtm.npz"))
+wiki_ts_dtm = sparse.load_npz(Path(infile_path, "ts.dtm.npz"))
+
+wiki_full_dtm = sparse.load_npz(Path(infile_path, "full.dtm.npz"))
+wiki_full_filtered_dtm = sparse.load_npz(Path(infile_path, "full_filtered.dtm.npz"))
+wiki_tr_va_filtered_dtm = sparse.load_npz(Path(infile_path, "full_train_val_filtered.dtm.npz"))
+
+def create_val_dtm():
+    # prepare the val.dtm.npz file
+    wiki_val_size = 4200
+    rand_ind = np.random.randint(0, wiki_full_filtered_dtm.shape[0], wiki_val_size)
+    wiki_val = wiki_full_filtered_dtm[rand_ind, :]
+    # save val set
+    print("saving val set...")
+    sparse.save_npz(Path(infile_path, "val.dtm.npz"), wiki_val)
+
+def create_ts_dtm():
+    # prepare the ts.dtm.npz file
+    wiki_ts_size = 4200
+    rand_ind = np.random.randint(0, wiki_tr_va_filtered_dtm.shape[0], wiki_ts_size)
+    wiki_ts = wiki_tr_va_filtered_dtm[rand_ind, :]
+    # save test set
+    print("saving test set...")
+    sparse.save_npz(Path(infile_path, "ts.dtm.npz"), wiki_ts)
+
+def json_to_pkl():
+    # Converting json file into pickle file
+    print('converting json file to pickle file...')
+    # read json file
+    with open(infile_path + 'vocab.json', 'r') as infile:
+        json_obj = json.load(infile)
+        pkl_obj = pickle.loads(pickle.dumps(json_obj))
+        vocab = list(pkl_obj.keys())
+
+    # write into pkl file
+    if not os.path.isdir(outfile_path):
+        os.system('mkdir -p ' + outfile_path)
+    with open(outfile_path + 'vocab.pkl', 'wb') as outfile:
+        pickle.dump(vocab, outfile)
+
+    return vocab
+
+# create val/ts dtm files
+#create_val_dtm()
+#create_ts_dtm()
+
+# print(wiki_full_filtered_dtm.shape)
+# print(wiki_train_dtm.shape)
+vocab = json_to_pkl()
+n_docs_tr = wiki_train_dtm.shape[0]
+n_docs_val = wiki_val_dtm.shape[0]
+n_docs_ts = wiki_ts_dtm.shape[0]
+n_docs_full = wiki_full_dtm.shape[0]
+
+# Split test set into 2 halves
+print('splitting test documents into 2 halves...')
+ts_h1 = np.arange(0, n_docs_ts/2, dtype=int)
+ts_h2 = np.arange(n_docs_ts/2, n_docs_ts, dtype=int)
+bow_ts_h1 = wiki_ts_dtm[ts_h1]
+bow_ts_h2 = wiki_ts_dtm[ts_h2]
+
+n_docs_ts_h1 = bow_ts_h1.shape[0]
+n_docs_ts_h2 = bow_ts_h2.shape[0]
+
+# split bow into token/value pairs
+print('splitting bow into token/value pairs and saving to disk...')
+def split_bow(bow_in, n_docs):
+    indices = [[w for w in bow_in[doc,:].indices] for doc in range(n_docs)]
+    counts = [[c for c in bow_in[doc,:].data] for doc in range(n_docs)]
+    return indices, counts
+
+# split train
+bow_tr_tokens, bow_tr_counts = split_bow(wiki_train_dtm, n_docs_tr)
+savemat(outfile_path + 'bow_tr_tokens.mat', {'tokens': bow_tr_tokens}, do_compression=True)
+savemat(outfile_path + 'bow_tr_counts.mat', {'counts': bow_tr_counts}, do_compression=True)
+#del vocab
+del bow_tr_tokens
+del bow_tr_counts
+del n_docs_tr
+
+# split val
+bow_val_tokens, bow_val_counts = split_bow(wiki_val_dtm, n_docs_val)
+savemat(outfile_path + 'bow_va_tokens.mat', {'tokens': bow_val_tokens}, do_compression=True)
+savemat(outfile_path + 'bow_va_counts.mat', {'counts': bow_val_counts}, do_compression=True)
+del bow_val_tokens
+del bow_val_counts
+del n_docs_val
+
+# split test
+bow_ts_tokens, bow_ts_counts = split_bow(wiki_ts_dtm, n_docs_ts)
+savemat(outfile_path + 'bow_ts_tokens.mat', {'tokens': bow_ts_tokens}, do_compression=True)
+savemat(outfile_path + 'bow_ts_counts.mat', {'counts': bow_ts_counts}, do_compression=True)
+del bow_ts_tokens
+del bow_ts_counts
+del n_docs_ts
+
+bow_ts_h1_tokens, bow_ts_h1_counts = split_bow(bow_ts_h1, n_docs_ts_h1)
+savemat(outfile_path + 'bow_ts_h1_tokens.mat', {'tokens': bow_ts_h1_tokens}, do_compression=True)
+savemat(outfile_path + 'bow_ts_h1_counts.mat', {'counts': bow_ts_h1_counts}, do_compression=True)
+del bow_ts_h1
+del bow_ts_h1_tokens
+del bow_ts_h1_counts
+
+bow_ts_h2_tokens, bow_ts_h2_counts = split_bow(bow_ts_h2, n_docs_ts_h2)
+savemat(outfile_path + 'bow_ts_h2_tokens.mat', {'tokens': bow_ts_h2_tokens}, do_compression=True)
+savemat(outfile_path + 'bow_ts_h2_counts.mat', {'counts': bow_ts_h2_counts}, do_compression=True)
+del bow_ts_h2
+del bow_ts_h2_tokens
+del bow_ts_h2_counts
+
+print('Data ready !!')
+print('*************')
\ No newline at end of file
diff --git a/run_dvae_20ng.sh b/run_dvae_20ng.sh
new file mode 100644
index 0000000..8afd4fe
--- /dev/null
+++ b/run_dvae_20ng.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+#SBATCH --nodes=1
+#SBATCH --time=24:00:00
+#SBATCH --partition=informatik-mind
+#SBATCH --ntasks=5
+#SBATCH --job-name=master-project
+#SBATCH --gres=gpu:V100:1
+#SBATCH --mem=20G
+#SBATCH -o /scratch/yusupova/server_dump/output.%j.%N.log
+#SBATCH -e /scratch/yusupova/server_dump/error.%j.%N.log
+
+module load anaconda3/latest
+. $ANACONDA_HOME/etc/profile.d/conda.sh
+
+conda activate dvae
+python3 soup_nuts/models/dvae/main.py \
+    --input_dir soup_nuts/models/etm/scripts/min_df_10_dvae_format \
+    --output_dir results/dvae-20ng \
+    --eval_path train.dtm.npz \
+    --num_topics 50
+
+conda deactivate
diff --git a/run_dvae_wiki.sh b/run_dvae_wiki.sh
new file mode 100644
index 0000000..d018dde
--- /dev/null
+++ b/run_dvae_wiki.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+#SBATCH --nodes=1
+#SBATCH --time=24:00:00
+#SBATCH --partition=informatik-mind
+#SBATCH --ntasks=5
+#SBATCH --job-name=master-project
+#SBATCH --gres=gpu:V100:1
+#SBATCH --mem=20G
+#SBATCH -o /scratch/yusupova/server_dump/output.%j.%N.log
+#SBATCH -e /scratch/yusupova/server_dump/error.%j.%N.log
+
+module load anaconda3/latest
+. $ANACONDA_HOME/etc/profile.d/conda.sh
+
+conda activate dvae
+python3 soup_nuts/models/dvae/main.py \
+    --input_dir data/wikitext/processed_wiki \
+    --output_dir results/dvae-wiki \
+    --eval_path train.dtm.npz \
+    --num_topics 50
+
+conda deactivate
diff --git a/run_etm_20ng.sh b/run_etm_20ng.sh
new file mode 100644
index 0000000..f12fb59
--- /dev/null
+++ b/run_etm_20ng.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --nodes=1
+#SBATCH --time=96:00:00
+#SBATCH --partition=informatik-mind
+#SBATCH --ntasks=5
+#SBATCH --job-name=master-project
+#SBATCH --gres=gpu:V100:1
+#SBATCH --mem=20G
+#SBATCH -o /scratch/yusupova/server_dump/output.%j.%N.log
+#SBATCH -e /scratch/yusupova/server_dump/error.%j.%N.log
+
+module load anaconda3/latest
+. $ANACONDA_HOME/etc/profile.d/conda.sh
+
+conda activate etm
+python3 soup_nuts/models/etm/main.py \
+    --mode train \
+    --data_path soup_nuts/models/etm/data/20ng \
+    --output_dir results/etm-20ng \
+    --num_topics 50 \
+    --train_embeddings 1 \
+    --epochs 1000 \
+    --tc 1
+
+conda deactivate
+
--
GitLab