-
Notifications
You must be signed in to change notification settings - Fork 0
/
inventory_catalogs_and_matching.py
148 lines (131 loc) · 7.54 KB
/
inventory_catalogs_and_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import re
import argparse
import pandas as pd
from mysql.connector.errors import ProgrammingError
from sqlalchemy import exc
from config.paths import audio_path, video_path, midi_path
from db_handler.db_handler import DatabaseHandler
from utils.catalog_utils import create_catalog, cleanup_file_titles, setup_collection_directory
from utils.catalog_utils import get_clean_song_titles_from_spotify, collect_matched_files
from utils.common_utils import script_start_time, script_run_time
# Import the fingerprinting backend. The import itself can raise a
# ProgrammingError -- presumably because the 'dejavu' database does not exist
# yet (TODO confirm against fingerprinting/__init__). When that happens the
# database is created through DatabaseHandler and the import is retried once.
try:
    from fingerprinting import djv
except ProgrammingError:
    # NOTE: this binds a module-level ``db`` handler with no database selected.
    db = DatabaseHandler()
    db.create_db('dejavu')
    from fingerprinting import djv
# set pandas print_options for debugging purposes
pd.options.display.width = 0

# Initiate the parser.
# RawTextHelpFormatter is used so the explicit line breaks in the --mode help
# text survive; the default formatter re-wraps the text and drops them.
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
# The help text lists every mode the script accepts, including 'video'.
parser.add_argument("-M", "--mode", help="Which part of the pipeline you need to run?\n"
                                         "Choose between:\n all,\n midi,\n audio,\n video,\n merge")
# Parsed at import time; the __main__ section reads ``args.mode``.
args = parser.parse_args()
# Values accepted for --mode.
ACCEPTABLE_MODES = ('all', 'midi', 'audio', 'video', 'merge')

# we will remove the windows hidden files (and other non-media sidecar files)
IRRELEVANT_FILES = re.compile(r'(desktop\.ini)|(.*\.(jpg|db|txt|url|srt|info|nfo))')

# Each midi file also has an indexing file whose name starts with "._".
# We remove those, together with .DS_Store entries.
IRRELEVANT_MIDI = re.compile(r'\._|\.DS_Store')

# Directories that are skipped while building the midi catalog.
IRRELEVANT_MIDI_DIRS = ['lmd_matched', 'cariart', 'download-midi']


def recall_percent(matched, total):
    """Return ``matched`` as a percentage of ``total``.

    Returns ``None`` when ``total`` is 0, so the caller can report "nothing to
    measure" instead of dividing by zero.
    """
    if not total:
        return None
    return 100 * matched / total


def append_new_rows(frame, table, con, key, columns=None, **to_sql_kwargs):
    """Append to ``table`` the rows of ``frame`` whose ``key`` is not stored yet.

    :param frame: DataFrame holding the candidate rows.
    :param table: name of the target SQL table.
    :param con: database connection handed to pandas.
    :param key: column used to recognise rows that are already stored.
    :param columns: optional list of columns to write; all columns by default.
    :param to_sql_kwargs: extra keyword arguments forwarded to ``DataFrame.to_sql``.
    :return: the rows that were considered new (all columns kept).
    """
    try:
        # Only the read is guarded: pandas raises ValueError when the table
        # cannot be found, and in that case every row counts as new.
        already_there = pd.read_sql_table(table, con=con)
    except ValueError:
        pass
    else:
        frame = frame.loc[~frame[key].isin(already_there[key])]
    to_write = frame if columns is None else frame[columns]
    to_write.to_sql(table, con=con, if_exists='append', index=False, **to_sql_kwargs)
    return frame


def ensure_table(db, table, sql_file):
    """Run ``sql_file`` through ``db`` unless ``table`` is reported as existing."""
    if not db.check_for_existing_tables(table):
        db.execute_from_file(sql_file)


def open_catalog_db(name='file_system_catalogs'):
    """Return a DatabaseHandler bound to ``name``, creating the database if needed."""
    try:
        return DatabaseHandler(name)
    except exc.OperationalError:
        print('No such DB found. Setting up the Database')
        DatabaseHandler().create_db(name)
        return DatabaseHandler(name)


def catalog_midi(db, db_connection):
    """Catalog the midi files, store the new ones and report the Spotify recall."""
    # start with the midi files
    print('starting midi catalog')
    midi_catalog = create_catalog(midi_path, IRRELEVANT_MIDI_DIRS, IRRELEVANT_MIDI)
    print('finish initial midi catalog')

    # clean-up the midi title names
    print('start midi catalog cleanup')
    ensure_table(db, 'midi_catalog', './db_handler/sql/create_midi_catalog.sql')
    midi_catalog = cleanup_file_titles(midi_catalog, "midi", allow_numbers=True)
    midi_catalog = get_clean_song_titles_from_spotify(midi_catalog)
    midi_catalog = midi_catalog.drop_duplicates()
    midi_catalog = append_new_rows(midi_catalog, 'midi_catalog', db_connection,
                                   key='filename', index_label='id')

    # The recall is measured on the newly added rows only; a re-run that finds
    # no new files leaves nothing to measure.
    matches = midi_catalog[midi_catalog['spotify_name'].notna()]
    recall = recall_percent(len(matches), len(midi_catalog))
    if recall is None:
        print('No new midi files were added, so there is no recall to report')
    else:
        print('The recall is %i per cent' % recall)
    # TODO: the match rate is at 67%. Try pylast or discogs_client to see if match rate improves.


def catalog_audio(db, db_connection):
    """Catalog the audio files and store the ones that are not in the DB yet."""
    # continue with the audio files
    print('start audio catalog')
    ensure_table(db, 'audio_catalog', './db_handler/sql/create_audio_catalog.sql')
    audio_catalog = create_catalog(audio_path, except_file=IRRELEVANT_FILES, except_dir=['Bootlegs'])
    audio_catalog = cleanup_file_titles(audio_catalog, "audio")
    audio_catalog = get_clean_song_titles_from_spotify(audio_catalog)
    audio_catalog = audio_catalog.drop_duplicates()
    append_new_rows(audio_catalog, 'audio_catalog', db_connection,
                    key='filename', index_label='id')
    print('finish audio catalog')


def catalog_video(db, db_connection):
    """Catalog the video files and store the ones that are not in the DB yet."""
    # continue with the video files
    ensure_table(db, 'video_catalog', './db_handler/sql/create_video_catalog.sql')
    print('start video catalog')
    video_catalog = create_catalog(video_path, except_file=IRRELEVANT_FILES)
    video_catalog = cleanup_file_titles(video_catalog, "video", allow_numbers=True)
    video_catalog['full_path'] = video_catalog['directory'] + '/' + video_catalog['filename']
    video_catalog['searched'] = 0
    append_new_rows(video_catalog, 'video_catalog', db_connection,
                    key='filename', index_label='id')
    print('finish video catalog')


def merge_midi_audio(db, db_connection):
    """Pair midi and audio rows that share a Spotify URL and store the pairs."""
    print('Loading tables')
    # NOTE(review): the existence check looks for 'midi_audio_matches' while the
    # script that is executed is named after audio/video matches -- confirm
    # that this SQL file really creates the 'midi_audio_matches' table.
    ensure_table(db, 'midi_audio_matches', './db_handler/sql/create_audio_video_matches_table.sql')
    midi = pd.read_sql_table('midi_catalog', con=db_connection)
    audio = pd.read_sql_table('audio_catalog', con=db_connection)
    pos_midi = midi[midi['spotify_name'].notna()]
    pos_audio = audio[audio['spotify_name'].notna()]

    print('keep matches and drop duplicates')
    merged = pos_midi.merge(right=pos_audio, how='inner', on='spotify_URL', suffixes=('_midi', '_audio'))
    merged = merged.drop_duplicates(subset='spotify_URL')

    print('creating the directories to store files')
    new_midi_dir, new_audio_dir, _ = setup_collection_directory()
    print('directories created successfully')

    if merged.empty:
        # A row-wise apply on an empty frame does not yield a column of pair
        # ids, so there is nothing to collect or to write.
        print('no matching midi/audio pairs found')
        return

    print('collecting the files')
    merged['pair_id'] = merged.apply(collect_matched_files, new_midi_path=new_midi_dir,
                                     new_audio_path=new_audio_dir, axis=1)
    merged['djv_song_id'] = 'null'

    print('writing to db')
    append_new_rows(merged, 'midi_audio_matches', db_connection, key='pair_id',
                    columns=['pair_id', 'index_midi', 'index_audio', 'djv_song_id'])


if __name__ == '__main__':
    if args.mode not in ACCEPTABLE_MODES:
        raise ValueError("Please provide a valid value for --mode")
    mode = args.mode
    print('Running in {} mode'.format(mode))
    script_start_time()

    db = open_catalog_db()
    db_connection = db.connection

    # Each stage runs when it is selected explicitly or when mode is 'all'.
    if mode in ('midi', 'all'):
        catalog_midi(db, db_connection)
    if mode in ('audio', 'all'):
        catalog_audio(db, db_connection)
    if mode in ('video', 'all'):
        catalog_video(db, db_connection)
    if mode in ('merge', 'all'):
        merge_midi_audio(db, db_connection)

    script_run_time()