# pytest tests
import pytest
import shutil
import json

# test files: English episode, from Accidental Tech Podcast
transcript_test_file_path = "test_files/atp367.mp3_transcript.json"  # atp 367
transcript_segmented_file_path = "test_files/atp367_chapters.json"  # atp 367
preprocessed_documents_path = "test_files/preprocessed_tokenized.json"  # atp 368


# test data fixtures
@pytest.fixture
def transcript_file_path(tmp_path):
    # copy the test transcript into tmp_path so tests can modify it safely
    path = tmp_path.joinpath('transcript.json')
    shutil.copyfile(transcript_test_file_path, path)
    return path


@pytest.fixture
def transcript_json():
    with open(transcript_test_file_path, 'r') as f:
        j = json.load(f)
    return j


@pytest.fixture
def segmented_transcript():
    with open(transcript_segmented_file_path, 'r') as f:
        j = json.load(f)
    return j


@pytest.fixture
def preprocessed_documents():
    with open(preprocessed_documents_path, 'r') as f:
        j = json.load(f)
    return j


def test_chapterizer(transcript_json):
    from transcribe.SpeechToTextModules.SpeechToTextModule import TranscriptToken
    from chapterize.chapterizer import Chapterizer

    tokens = [TranscriptToken.from_dict(token) for token in transcript_json['tokens']]
    chapterizer = Chapterizer()
    concat_segments, minima = chapterizer.chapterize(tokens, boundaries=transcript_json['boundaries'], language='en', visual=False)
    assert len(concat_segments) > 1


def test_chapterizer_no_boundaries(transcript_json):
    from transcribe.SpeechToTextModules.SpeechToTextModule import TranscriptToken
    from chapterize.chapterizer import Chapterizer

    tokens = [TranscriptToken.from_dict(token) for token in transcript_json['tokens']]
    chapterizer = Chapterizer()
    concat_segments, minima = chapterizer.chapterize(tokens, boundaries=[], language='en', visual=False)
    assert len(concat_segments) > 1


# def test_chapter_namer(segmented_transcript):
#     # tokens = transcript_json['']
#     assert segmented_transcript[0] == None
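# A hedged sketch of how the stub above might look once enabled. The
# chapter_names import, its signature, and the 'segments' key are
# assumptions, not verified against this repo; adjust to the real
# chapter namer API before uncommenting.
# def test_chapter_namer(segmented_transcript):
#     from chapterize.chapter_namer import chapter_names  # hypothetical import
#     names = chapter_names([chapter['segments'] for chapter in segmented_transcript])  # 'segments' key assumed
#     assert len(names) == len(segmented_transcript)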


def test_document_vectorizer(preprocessed_documents):
    from chapterize.document_vectorizer import DocumentVectorizer
    from chapterize.chapterizer import Chapterizer
    from scipy import sparse

    chapterizer = Chapterizer()  # instantiate a Chapterizer to access its default hyperparameters
    methods = ['ft_sif_average', 'tfidf', 'ft_average', 'ft_sum']
    for method in methods:
        dv = DocumentVectorizer(chapterizer.tfidf_min_df, chapterizer.tfidf_max_df)
        document_vectors = dv.vectorize_docs(method, preprocessed_documents, language='en')
        assert isinstance(document_vectors, sparse.csr_matrix), f"method {method}: document vectors should be of type csr_matrix"
        assert document_vectors.shape[0] == len(preprocessed_documents), f"method {method}: there should be as many document vectors as input documents"
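

# The loop above aborts on the first failing method; this parametrized
# variant (a sketch using only stock pytest, no additional project API
# assumed) reports each vectorization method as a separate test case.
@pytest.mark.parametrize('method', ['ft_sif_average', 'tfidf', 'ft_average', 'ft_sum'])
def test_document_vectorizer_parametrized(preprocessed_documents, method):
    from chapterize.document_vectorizer import DocumentVectorizer
    from chapterize.chapterizer import Chapterizer
    from scipy import sparse

    chapterizer = Chapterizer()
    dv = DocumentVectorizer(chapterizer.tfidf_min_df, chapterizer.tfidf_max_df)
    document_vectors = dv.vectorize_docs(method, preprocessed_documents, language='en')
    assert isinstance(document_vectors, sparse.csr_matrix)
    assert document_vectors.shape[0] == len(preprocessed_documents)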


def test_boundary_refinement():
    from chapterize.chapterizer import Chapterizer

    chapterizer = Chapterizer(max_utterance_delta=3)  # override max_utterance_delta to keep the test data small
    boundaries = [1, 5, 12, 20, 29, 40, 55]
    true_boundaries = [2, 3, 9, 13, 24, 27, 43, 60]
    refined_target = [2, 3, 13, 20, 27, 43, 55]

    refined_boundaries = chapterizer.refine_boundaries(boundaries, true_boundaries)
    assert refined_boundaries == refined_target
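

# A minimal reference check of the snapping rule the fixture data above
# implies: each detected boundary moves to the closest true boundary when
# one lies within max_utterance_delta, and stays put otherwise. This is an
# inference from the test data, not the project's documented algorithm.
def test_boundary_refinement_matches_reference():
    from chapterize.chapterizer import Chapterizer

    max_delta = 3
    boundaries = [1, 5, 12, 20, 29, 40, 55]
    true_boundaries = [2, 3, 9, 13, 24, 27, 43, 60]

    def snap(boundary):
        # closest true boundary; keep the original if it is too far away
        closest = min(true_boundaries, key=lambda t: abs(t - boundary))
        return closest if abs(closest - boundary) <= max_delta else boundary

    reference = [snap(b) for b in boundaries]
    chapterizer = Chapterizer(max_utterance_delta=max_delta)
    assert chapterizer.refine_boundaries(boundaries, true_boundaries) == reference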