Commit e6ea9b8b authored by Lukáš Lalinský

Script to deduplicate tracks

parent 5d53ef9a
......@@ -7,6 +7,7 @@ from acoustid import tables as schema, const
from acoustid.data.fingerprint import lookup_fingerprint, insert_fingerprint, inc_fingerprint_submission_count, FingerprintSearcher
from acoustid.data.musicbrainz import find_puid_mbids, resolve_mbid_redirect
from acoustid.data.track import insert_track, insert_mbid, insert_puid, merge_tracks, insert_track_meta, can_add_fp_to_track, can_merge_tracks, insert_track_foreignid
logger = logging.getLogger(__name__)
......
......@@ -5,6 +5,8 @@ import logging
import uuid
from sqlalchemy import sql
from acoustid import tables as schema, const
from acoustid.data.fingerprint import lookup_fingerprint, insert_fingerprint, inc_fingerprint_submission_count, FingerprintSearcher
from acoustid.data.musicbrainz import find_puid_mbids, resolve_mbid_redirect
logger = logging.getLogger(__name__)
......@@ -332,3 +334,44 @@ def can_add_fp_to_track(conn, track_id, fingerprint, length):
return False
return True
def find_track_duplicates(conn, fingerprint, index=None):
    """Merge tracks that the given fingerprint shows to be duplicates.

    Searches for fingerprints matching ``fingerprint`` using a
    ``FingerprintSearcher`` whose ``min_score`` is set to
    ``const.TRACK_MERGE_THRESHOLD``, collects the distinct tracks for which
    ``can_add_fp_to_track`` returns true, and, if there is more than one such
    track, merges the first group of two or more tracks returned by
    ``can_merge_tracks`` into the lowest track ID of that group. Finally the
    fingerprint ID is recorded in ``fingerprint_deduplicate``.

    All of this runs inside a single ``conn.begin()`` block.

    :param conn: database connection providing ``begin()`` and ``execute()``
    :param fingerprint: row/mapping with ``id``, ``fingerprint`` and
        ``length`` keys
    :param index: passed through to ``FingerprintSearcher``
    :returns: ``None``
    """
    with conn.begin():
        searcher = FingerprintSearcher(conn, index)
        searcher.min_score = const.TRACK_MERGE_THRESHOLD
        matches = searcher.search(fingerprint['fingerprint'], fingerprint['length'])
        if not matches:
            # NOTE(review): returning here skips the INSERT below, so this
            # fingerprint is never recorded in fingerprint_deduplicate and
            # will be picked up again by any query selecting unprocessed
            # fingerprints. Confirm whether that is intended.
            logger.debug("Not matched itself!")
            return
        logged = False
        all_track_ids = set()
        possible_track_ids = set()
        for m in matches:
            # Each track is evaluated only once, using its first match.
            if m['track_id'] in all_track_ids:
                continue
            all_track_ids.add(m['track_id'])
            if can_add_fp_to_track(conn, m['track_id'], fingerprint['fingerprint'], fingerprint['length']):
                if m['id'] != fingerprint['id']:
                    # Emit the header line once, before the first match that
                    # is not the fingerprint itself.
                    if not logged:
                        logger.debug("Deduplicating fingerprint %d", fingerprint['id'])
                        logged = True
                    logger.debug("Fingerprint %d with track %d is %d%% similar", m['id'], m['track_id'], m['score'] * 100)
                possible_track_ids.add(m['track_id'])
        if len(possible_track_ids) > 1:
            for group in can_merge_tracks(conn, possible_track_ids):
                if len(group) > 1:
                    # Keep the lowest track ID and merge the rest into it.
                    target_track_id = min(group)
                    group.remove(target_track_id)
                    merge_tracks(conn, target_track_id, list(group))
                    # Only one group is merged per call.
                    break
        # Mark the fingerprint as processed.
        conn.execute("INSERT INTO fingerprint_deduplicate (id) VALUES (%s)", fingerprint['id'])
def find_duplicates(conn, limit=50, index=None):
    """Run track deduplication over fingerprints not yet processed.

    Selects up to ``limit`` fingerprints, ordered by ID, that have no row in
    ``fingerprint_deduplicate`` and calls ``find_track_duplicates`` on each.

    The batch size was previously hard-coded as ``LIMIT 1000`` in the SQL and
    the ``limit`` argument was ignored; it is now passed to the query as a
    bound parameter. Callers relying on the old batch size should pass
    ``limit=1000`` explicitly.

    :param conn: database connection providing ``execute()``
    :param limit: maximum number of fingerprints to process in this call
    :param index: passed through to ``find_track_duplicates``
    :returns: ``None``
    """
    query = (
        "SELECT f.id, fingerprint, length "
        "FROM fingerprint f "
        "LEFT JOIN fingerprint_deduplicate d ON f.id=d.id "
        "WHERE d.id IS NULL ORDER BY f.id LIMIT %s"
    )
    for fingerprint in conn.execute(query, limit):
        find_track_duplicates(conn, fingerprint, index=index)
#!/usr/bin/env python
# Copyright (C) 2011 Lukas Lalinsky
# Distributed under the MIT license, see the LICENSE file for details.
import chromaprint
from acoustid.script import run_script
from acoustid.data.track import find_duplicates
from acoustid.data.fingerprint import FingerprintSearcher
def main(script, opts, args):
    """Script entry point: run one deduplication pass.

    Opens a connection from ``script.engine``, calls ``find_duplicates`` with
    ``script.index``, and closes the connection afterwards, including when
    ``find_duplicates`` raises.

    :param script: object exposing ``engine`` and ``index`` attributes
    :param opts: not used by this script
    :param args: not used by this script
    """
    conn = script.engine.connect()
    try:
        find_duplicates(conn, index=script.index)
    finally:
        conn.close()
# Hand main() to the project's script runner imported from acoustid.script.
# NOTE(review): presumably run_script builds the `script` object (engine,
# index) and parses options before calling main — confirm in acoustid.script.
run_script(main)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment