Tag enhancement: find typos

Download and get help for different MediaMonkey for Windows 4 Addons.

Moderators: Peke, Gurus

GuHu
Posts: 63
Joined: Mon Feb 12, 2007 6:25 am

Tag enhancement: find typos

Post by GuHu »

Hello,

after writing some small scripts for personal use, I wrote my first script which may be interesting for other people ...

My aim is to get better tags. Scripts like personal tag enhancer and tagging inconsistencies helped me a lot, but I found that typos give pain to me.

Thus I wrote a little script which points out possible typos in the artist's name. Simply copy it to the auto-folder. It creates a node under "Files to edit". Since the script is a little slooooowwww, it treats only songs from the main window.

Songs can be treated with drag and drop.

Hints to make the script faster are welcome :D

Here is the script, have fun:


Code: Select all


'
' MediaMonkey Script
'
' NAME: Typo 0.1
'
' AUTHOR: GuHu
' DATE : 01/18/08
'
'
' INSTALL: Copy to Scripts/auto directory
'

Option Explicit

' Similarity threshold: LD stops early once a whole matrix row has reached this
' value, and FillArtists only reports pairs whose distance is below it.
Dim Max : Max=4  ' max distance
' Shared work matrix used by LD. It is indexed by string length, so with these
' bounds it can hold strings of up to 180 characters.
Dim d(180,180)  ' matrix here and not in function for performance
' Tree and node handles assigned in OnStartup and FillArtists.
Dim Tree, Node, Subnode, Artistnode,Sartistnode

Sub OnStartup
  ' Startup entry point: builds the "Typos" node under "Files to edit" and
  ' gives it an "Artists" child whose children are produced on demand by
  ' FillArtists in this script file.

  Set Tree = SDB.MainTree

  ' Top-level "Typos" node
  Set Node = Tree.CreateNode
  With Node
    .Caption = SDB.Localize("Typos")
    .IconIndex = 49
  End With
  Tree.AddNode Tree.Node_FilesToEdit, Node, 2
  Node.HasChildren = True

  ' "Artists" node, filled lazily through its OnFillChildren handler
  Set Subnode = Tree.CreateNode
  With Subnode
    .Caption = SDB.Localize("Artists")
    .IconIndex = 0
    .UseScript = Script.ScriptPath
    .OnFillChildren = "FillArtists"
  End With
  Tree.AddNode Node, Subnode, 3
  Subnode.HasChildren = True

  ' Pre-create a node object for the global handle (it is not added to the tree here)
  Set Sartistnode = Tree.CreateNode

End Sub



' some needed functions
' partly (c) from levenshtein.de
' converted from VB to VBA by GuHu
'
'*******************************
'*** Get minimum of three values
'*******************************

Private Function Minimum(a,b,c)
  ' Returns the smallest of the three values a, b and c.
  ' On ties the earlier argument wins (only strictly smaller values replace it).

  Dim smallest

  If b < a Then
    smallest = b
  Else
    smallest = a
  End If

  If c < smallest Then smallest = c

  Minimum = smallest

End Function

'********************************
'*** Compute Levenshtein Distance
'********************************

Public Function LD(s,t)
' Levenshtein (edit) distance between the strings s and t, with early exit.
'
' Parameters:
'   s, t - the two strings to compare.
' Returns:
'   The exact distance when it is below the global threshold Max.
'   Otherwise some value >= Max (a lower bound of the real distance), which
'   callers must only read as "not similar".
'   If one string is empty, the length of the other one.
' Uses the global matrix d as work space (declared globally for speed) and
' the sibling function Minimum.

Dim m  ' length of t
Dim n  ' length of s
Dim i  ' iterates through s
Dim j  ' iterates through t
Dim s_i  ' ith character of s
Dim t_j  ' jth character of t
Dim cost ' cost
Dim rowMin  ' smallest value in the current matrix row

  ' Step 1: trivial cases

  n = Len(s)
  m = Len(t)
  If n = 0 Then
    LD = m
    Exit Function
  End If
  If m = 0 Then
    LD = n
    Exit Function
  End If

  ' The distance is never smaller than the difference in length, so a pair
  ' whose lengths differ by Max or more cannot be similar. Skipping the
  ' matrix here saves most of the run time on large artist lists.
  If Abs(n - m) >= Max Then
    LD = Abs(n - m)
    Exit Function
  End If

  ' The matrix is indexed by string length. Strings that do not fit would
  ' raise "Subscript out of range", so report them as "not similar".
  If n > UBound(d, 1) Or m > UBound(d, 2) Then
    LD = Max
    Exit Function
  End If

  ' Step 2: initialise first column and first row

  For i = 0 To n
    d(i, 0) = i
  Next

  For j = 0 To m
    d(0, j) = j
  Next

  ' Step 3: walk through s

  For i = 1 To n

    s_i = Mid(s, i, 1)
    ' Start above any value a cell can take (a distance never exceeds n + m)
    rowMin = n + m

    ' Step 4: walk through t

    For j = 1 To m

      t_j = Mid(t, j, 1)

      ' Step 5: substitution cost

      If s_i = t_j Then
        cost = 0
      Else
        cost = 1
      End If

      ' Step 6: cheapest of deletion, insertion and substitution

      d(i, j) = Minimum(d(i - 1, j) + 1, d(i, j - 1) + 1, d(i - 1, j - 1) + cost)
      If rowMin > d(i, j) Then
        rowMin = d(i, j)
      End If
    Next

    ' Row minima never decrease from one row to the next, so once a whole
    ' row has reached Max the final distance cannot drop below it again.
    If Not (rowMin < Max) Then
      LD = rowMin
      Exit Function
    End If
  Next

  ' Step 7: the bottom-right cell holds the distance

  LD = d(n, m)

  ' d is deliberately not erased: every cell that is read has been written
  ' during this call.

End Function


Sub FillTracks(Node)
  ' Fills the main tracks window with all songs whose artist equals the
  ' caption of the selected tree node.
  '
  ' Node - the tree node that was selected; its Caption holds the artist name.

  Dim sql, Tracks, artist

  Set Tracks = SDB.MainTracksWindow

  ' Use a single-quoted SQL string literal and double every apostrophe in
  ' the name. The previous double-quoted form broke on names containing a
  ' double quote and can be read as a column name by SQLite, which made such
  ' artists show up with no tracks.
  artist = Replace(Node.Caption, "'", "''")
  sql = "WHERE Songs.Artist = '" & artist & "'"

  Tracks.AddTracksFromQuery(sql)
  Tracks.FinishAdding
End Sub

Function TrackDragDrop( destNode, srcNode, SongList, DropType, Test)
' Drag-and-drop handler for the artist nodes created by FillArtists.
'
' destNode - node the songs were dropped on; its Caption is the new artist name.
' SongList - the dropped songs.
' Test     - when true, nothing is changed and the function just returns 2.
' srcNode and DropType are not used.
' When Test is false the function returns no value.

Dim idx, track

If Test Then
  TrackDragDrop = 2
  Exit Function
End If

For idx = 0 To SongList.Count - 1
  Set track = SongList.Item(idx)
  With track
    ' Keep the album artist in step, but only if it matched the old artist
    If .ArtistName = .AlbumArtistName Then
      .AlbumArtistName = destNode.Caption
    End If
    .ArtistName = destNode.Caption
    .UpdateDB
    .UpdateArtist
  End With
Next

End Function


Sub FillArtists(Subnode)
' OnFillChildren handler of the "Artists" node.
' Compares every pair of artists of the visible songs with LD. For each artist
' that has at least one similar partner it adds a group node below Subnode;
' the group holds one leaf for the artist itself and one leaf per similar
' artist. Leaves list their tracks through FillTracks and accept drops through
' TrackDragDrop.

Dim songs, artists, outer, inner, nameA, nameB, distance, groupOpen, lastIdx
Dim Progress

Set Tree = SDB.MainTree
Set songs = SDB.AllVisibleSongList
Set artists = songs.Artists

Set Progress = SDB.Progress
Progress.Text = SDB.Localize("Find similar Artists ...")
Progress.MaxValue = artists.count

lastIdx = artists.count - 1

For outer = 0 To lastIdx

  Progress.Value = outer + 1
  If Progress.Terminate Then Exit For

  nameA = artists.item(outer).Name
  groupOpen = False

  For inner = outer + 1 To lastIdx
    nameB = artists.item(inner).Name
    distance = LD(nameA, nameB)

    ' Similar = distance below Max and below half the length of either name
    If distance < Minimum(Max, Len(nameA)*0.5, Len(nameB)*0.5) Then

      If Not groupOpen Then
        ' First hit for nameA: create the group node and a leaf for nameA itself
        groupOpen = True

        Set Artistnode = Tree.CreateNode
        Artistnode.Caption = nameA
        Tree.AddNode Subnode, Artistnode, 3
        Artistnode.HasChildren = True

        Set Sartistnode = Tree.CreateNode
        With Sartistnode
          .Caption = nameA
          .OnDragDrop = "TrackDragDrop"
          .OnFillTracksFunct = "FillTracks"
          .UseScript = Script.ScriptPath
        End With
        Tree.AddNode Artistnode, Sartistnode, 3
      End If

      ' Leaf for the similar artist
      Set Sartistnode = Tree.CreateNode
      With Sartistnode
        .Caption = nameB
        .OnDragDrop = "TrackDragDrop"
        .OnFillTracksFunct = "FillTracks"
        .UseScript = Script.ScriptPath
      End With
      Tree.AddNode Artistnode, Sartistnode, 3

    End If
  Next
Next

End Sub

RedX
Posts: 366
Joined: Wed Dec 27, 2006 10:32 am
Location: Germany

Post by RedX »

What kind of typos does it correct and what fields does it search in or change?

Does it correct automatically, or does it only list the files with typos?

(Would be nice to know before i let it out on my db)

Regards,
Red

PS: Doesn't seem to work here nothing appears not even a progressbar that it is comparing anything...

How do you use this??
GuHu
Posts: 63
Joined: Mon Feb 12, 2007 6:25 am

Post by GuHu »

Hy RedX

it shows (possible) typos with max. 4 errors, i.e. you can transform the field with max. 4 manipulations to the right one.

Currently, it analyzes only the tracks in the main window, i.e. in order to analyze your whole library, all songs have to appear in the main window. You can test it with a subset if you want.

It does not correct anything automatically, because it cannot decide whether something is a typo or just two nearly equal fields.

Actually, only the author field is analyzed. It can be expanded to other fields.

GuHu
Last edited by GuHu on Sun Jan 20, 2008 12:42 am, edited 1 time in total.
bob61
Posts: 157
Joined: Sun Dec 09, 2007 4:52 pm

Post by bob61 »

GuHu - This looks good. I ran it over my Artists, 40,000 titles, and it took a while. I would like to see it be a little faster, but it is better than doing it manually.

I cleaned up quite a few typos. Encountered some strange things with a typo listed with no titles, not sure why. I didn't get through all the artists I need to clean up this pass so will go through it again when I have more time.

Thanks for posting your script!
GuHu
Posts: 63
Joined: Mon Feb 12, 2007 6:25 am

Post by GuHu »

Hy bob61,

thanks! What do you mean by typos with no titles? Tracks with no titles or tracks with no artist? Tracks with no titles should be possible, since only the artists are compared. If the artists are empty, there are maybe some invisible characters in there.

40,000 tracks.. uhh. I have about 60,000 tracks and after the first run there were so many typos that I did it in different sessions too.

GuHu
bob61
Posts: 157
Joined: Sun Dec 09, 2007 4:52 pm

Post by bob61 »

GuHu wrote:Hy bob61,

thanks! What do you mean with typos with no titles? Tracks with no titles or tracks with no artist? Tracks with no titltes should be possible since the artists are compared. If the aartists are empty, there are maybe some not visible characters there.
GuHu
I had artists listed that had no tracks listed: when I selected the artist, there were no tracks shown. Not sure why, will see if I can find a pattern.
GuHu
Posts: 63
Joined: Mon Feb 12, 2007 6:25 am

Post by GuHu »

Hy bob61,

ok I understand. Does the artist name include a quote sign ( ' ) ?

GuHu
bob61
Posts: 157
Joined: Sun Dec 09, 2007 4:52 pm

Post by bob61 »

GuHu wrote:Hy bob61,

ok I understand. Does the artist name include a quote sign ( ' ) ?

GuHu
I didn't notice that, but didn't really look into the details. What I have found is I've gone back to create the typo list again and when I open the "Artist" node the script doesn't execute. Not sure why it worked before and now I get nothing. RedX mentioned similar so must be something with the selection process. I have put songs into the selection window and run script, tried to highlight and then have script run, nothing.

Just my opinion, this should be more appropriately titled as "Find Similar Artist Spellings" as the way I see it working is it's not as specific to typos as it really is to similar names - still did find it useful!
Thufir Hawat
Posts: 6
Joined: Thu Nov 09, 2006 6:40 am

Post by Thufir Hawat »

Hi GuHu,

i made some improvements to your script:
  • *) exchanged the Levenshtein distance for the more general Damerau-Levenshtein distance (algorithm from wikibooks http://en.wikibooks.org/wiki/Algorithm_ ... plications with some small changes), which is nearly the same but counts swapped characters as a single difference instead of two. It should be faster too, as this implementation goes along a diagonal stripe of the matrix and does not compare strings which differ in length by more than the given max. It is still slow, though, as the complexity is O(|s1|*|s2|) at best.

    *) the script now treats only the selected songs in the main window.

    *) the script handles album typos too.

    *) as I don't want albums like "xxxx - CD1" and "xxxx - CD2" counted as typos, the script ignores " - CD"+$NUMBER$ and " - Vol."+$NUMBER$. If you name the CDs of a set differently, e.g. (CD?), you have to modify lines 251 to 270 (the " - CD" / " - Vol." handling at the top of the inner loop in FillAlbums).

Code: Select all

' MediaMonkey Script
'
' NAME: Typo 0.2
' AUTHOR: Thufir Hawat
' DATE:2008.03.13
'
' Original-AUTHOR: GuHu
' DATE : 01/18/08
'
'
' INSTALL: Copy to Scripts/auto directory
'

Option Explicit
 
' Similarity thresholds: FillArtists / FillAlbums report a pair only when its
' distance is below the respective value.
Dim ArtistMax : ArtistMax=4  ' max distance for Artists
Dim AlbumMax : AlbumMax=4  ' max distance for Albums
' Memo table handed to damerau_levenshtein as its "result" argument. It is
' indexed by string length, so with these bounds it can hold strings of up to
' 250 characters.
Dim d(250,250)  ' matrix here and not in function for performance
' Tree and node handles assigned in OnStartup, FillArtists and FillAlbums.
Dim Tree, Node, Subnode, Artistnode,Sartistnode,Albumnode,Salbumnode

Sub OnStartup
	' Startup entry point: builds the "Typos" node under "Files to edit" with
	' two children, "Artists" and "Albums". Their children are produced on
	' demand by FillArtists and FillAlbums in this script file.

	Set Tree = SDB.MainTree

	' Top-level "Typos" node
	Set Node = Tree.CreateNode
	With Node
		.Caption = SDB.Localize("Typos")
		.IconIndex = 49
	End With
	Tree.AddNode Tree.Node_FilesToEdit, Node, 2
	Node.HasChildren = True

	' "Artists" node
	Set Subnode = Tree.CreateNode
	With Subnode
		.Caption = SDB.Localize("Artists")
		.IconIndex = 0
		.UseScript = Script.ScriptPath
		.OnFillChildren = "FillArtists"
	End With
	Tree.AddNode Node, Subnode, 3
	Subnode.HasChildren = True

	' "Albums" node (reuses the global Subnode handle)
	Set Subnode = Tree.CreateNode
	With Subnode
		.Caption = SDB.Localize("Albums")
		.IconIndex = 0
		.UseScript = Script.ScriptPath
		.OnFillChildren = "FillAlbums"
	End With
	Tree.AddNode Node, Subnode, 3
	Subnode.HasChildren = True

	' Pre-create a node object for the global handle (it is not added to the tree here)
	Set Sartistnode = Tree.CreateNode

End Sub

'This function returns the Levenshtein distance capped by the limit parameter.
'
' Recursive implementation that compares the first characters of s1 and s2 and
' then recurses on the remaining suffixes.
'
' Parameters:
'   s1, s2 - the strings (or remaining suffixes) to compare.
'   limit  - cap for the result; the function never returns more than limit.
'   result - two-dimensional memo table indexed by (Len(s1), Len(s2)). A cell
'            holds "distance + 1", so that an empty/zero cell means "not yet
'            computed". It must be at least as large as the string lengths,
'            because it is indexed with them directly.
' Returns:
'   The computed distance if it is below limit, otherwise limit.
Function damerau_levenshtein(s1, s2, limit, result)
                    
    Dim diagonal 
    Dim horizontal 
    Dim vertical 
    Dim swap 
    Dim final 
   
    'Start of the strings analysis
    ' No memoised value for this pair of suffix lengths yet?
    If result(Len(s1), Len(s2)) < 1 OR result(Len(s1), Len(s2)) = "" Then
        ' Give up at once when the length difference alone reaches the limit
        If Abs(Len(s1) - Len(s2)) >= limit OR result(Len(s1), Len(s2)) >= limit Then
            final = limit
        Else
            If Len(s1) = 0 Or Len(s2) = 0 Then
                'End of recursivity
                ' One string is empty: the distance is the length of the other one
                final = Len(s1) + Len(s2)
            Else
            
                'Core of levenshtein algorithm
                If Mid(s1, 1, 1) = Mid(s2, 1, 1) Then
                    ' Equal first characters cost nothing; continue with the rest
                    final = damerau_levenshtein(Mid(s1, 2), Mid(s2, 2), limit, result)
                Else
                    
                    If Mid(s1, 1, 1) = Mid(s2, 2, 1) And Mid(s1, 2, 1) = Mid(s2, 1, 1) Then
                        'Damerau extension counting swapped letters
                        ' The first two characters are transposed: one edit, then
                        ' continue after both of them
                        swap = damerau_levenshtein(Mid(s1, 3), Mid(s2, 3), limit - 1, result)
                        final = 1 + swap
                    Else
                        'The function minimum is implemented via the limit parameter.
                        'The diagonal search usually reaches the limit the quickest.
                        ' Each call is capped by the result of the previous one, so
                        ' "vertical" ends up as the smallest of the three alternatives:
                        ' diagonal drops the first character of both strings,
                        ' horizontal drops it from s1 only, vertical from s2 only.
                        diagonal = damerau_levenshtein(Mid(s1, 2), Mid(s2, 2), limit - 1, result)
                        horizontal = damerau_levenshtein(Mid(s1, 2), s2, diagonal, result)
                        vertical = damerau_levenshtein(s1, Mid(s2, 2), horizontal, result)
                        final = 1 + vertical
                    End If
                End If
                
            End If
        End If
    Else
        'retrieve intermediate result
        ' Stored values are offset by one (see the store below)
        final = result(Len(s1), Len(s2)) - 1
    End If
        
    'returns the distance capped by the limit
    If final < limit Then
        damerau_levenshtein = final
        'store intermediate result
        ' Only results below the limit are memoised, stored as distance + 1
        result(Len(s1), Len(s2)) = final + 1
    Else
        damerau_levenshtein = limit
    End If
    
End Function

Sub FillTracks(Node)
  ' Fills the main tracks window with all songs whose artist equals the
  ' caption of the selected tree node.
  '
  ' Node - the tree node that was selected; its Caption holds the artist name.

  Dim sql, Tracks, artist

  Set Tracks = SDB.MainTracksWindow

  ' Use a single-quoted SQL string literal and double every apostrophe in
  ' the name. The previous double-quoted form broke on names containing a
  ' double quote and can be read as a column name by SQLite, which made such
  ' artists show up with no tracks.
  artist = Replace(Node.Caption, "'", "''")
  sql = "WHERE Songs.Artist = '" & artist & "'"

  Tracks.AddTracksFromQuery(sql)
  Tracks.FinishAdding
End Sub


Sub FillAlbumTracks(Node)
  ' Fills the main tracks window with all songs whose album equals the
  ' caption of the selected tree node.
  '
  ' Node - the tree node that was selected; its Caption holds the album name.

  Dim sql, Tracks, album

  Set Tracks = SDB.MainTracksWindow

  ' Use a single-quoted SQL string literal and double every apostrophe in
  ' the name. The previous double-quoted form broke on names containing a
  ' double quote and can be read as a column name by SQLite, which made such
  ' albums show up with no tracks.
  album = Replace(Node.Caption, "'", "''")
  sql = "WHERE Songs.Album = '" & album & "'"

  Tracks.AddTracksFromQuery(sql)
  Tracks.FinishAdding
End Sub

Function TrackDragDrop( destNode, srcNode, SongList, DropType, Test)
	' Drag-and-drop handler for the artist nodes created by FillArtists.
	'
	' destNode - node the songs were dropped on; its Caption is the new artist name.
	' SongList - the dropped songs.
	' Test     - when true, nothing is changed and the function just returns 2.
	' srcNode and DropType are not used.
	' When Test is false the function returns no value.

	Dim idx, track

	If Test Then
		TrackDragDrop = 2
		Exit Function
	End If

	For idx = 0 To SongList.Count - 1
		Set track = SongList.Item(idx)
		With track
			' Keep the album artist in step, but only if it matched the old artist
			If .ArtistName = .AlbumArtistName Then
				.AlbumArtistName = destNode.Caption
			End If
			.ArtistName = destNode.Caption
			.UpdateDB
			.UpdateArtist
			.WriteTags
		End With
	Next
End Function

Function AlbumTrackDragDrop( destNode, srcNode, SongList, DropType, Test)
	' Drag-and-drop handler for the album nodes created by FillAlbums.
	'
	' destNode - node the songs were dropped on; its Caption is the new album name.
	' SongList - the dropped songs.
	' Test     - when true, nothing is changed and the function just returns 2.
	' srcNode and DropType are not used.
	' When Test is false the function returns no value.

	Dim idx, track

	If Test Then
		AlbumTrackDragDrop = 2
		Exit Function
	End If

	For idx = 0 To SongList.Count - 1
		Set track = SongList.Item(idx)
		With track
			.AlbumName = destNode.Caption
			.UpdateDB
			.UpdateAlbum
			.WriteTags
		End With
	Next
End Function

Sub FillArtists(Subnode)
	' OnFillChildren handler of the "Artists" node.
	' Compares every pair of artists of the selected songs with
	' damerau_levenshtein. For each artist that has at least one similar
	' partner it adds a group node below Subnode; the group holds one leaf for
	' the artist itself and one leaf per similar artist. Leaves list their
	' tracks through FillTracks and accept drops through TrackDragDrop.
	'
	' Subnode - the node whose children are to be created.

	Dim list, artlist, i, j, dd, a1, a2, isnode, limit
	Dim Progress

	Set Tree = SDB.MainTree
	'Set list = SDB.CurrentSongList
	'Set list = SDB.AllVisibleSongList
	Set list = SDB.SelectedSongList
	Set artlist = list.Artists
	Set Progress = SDB.Progress
	Progress.Text = SDB.Localize("Find similar Artists ...")
	Progress.MaxValue = artlist.count

	' Cap handed to the distance function
	limit = ArtistMax + 1

	For i = 0 To artlist.count - 1
		Progress.Value = i + 1

		If Progress.Terminate Then
			Exit For
		End If

		a1 = artlist.item(i).Name
		isnode = False

		For j = i + 1 To artlist.count - 1
			a2 = artlist.item(j).Name

			' Default: "not similar"
			dd = limit

			' damerau_levenshtein returns the limit straight away when the
			' lengths differ by limit or more. Testing that here avoids
			' clearing the whole memo matrix for pairs that cannot match.
			If Abs(Len(a1) - Len(a2)) < limit Then
				' The memo matrix is indexed by string length; longer names
				' would raise "Subscript out of range", so they are skipped.
				If Len(a1) <= UBound(d, 1) And Len(a2) <= UBound(d, 2) Then
					Erase d
					dd = damerau_levenshtein(a1, a2, limit, d)
				End If
			End If

			If dd < ArtistMax Then
				If Not (isnode) Then
					' First hit for a1: create the group node and a leaf for a1 itself
					isnode = True
					Set Artistnode = Tree.CreateNode
					Artistnode.Caption = a1
					Tree.AddNode Subnode, Artistnode, 3
					Artistnode.HasChildren = True
					Set Sartistnode = Tree.CreateNode
					Sartistnode.Caption = a1
					Sartistnode.OnDragDrop = "TrackDragDrop"
					Sartistnode.OnFillTracksFunct = "FillTracks"
					Sartistnode.UseScript = Script.ScriptPath
					Tree.AddNode Artistnode, Sartistnode, 3
				End If

				' Leaf for the similar artist
				Set Sartistnode = Tree.CreateNode
				Sartistnode.Caption = a2
				Sartistnode.OnDragDrop = "TrackDragDrop"
				Sartistnode.OnFillTracksFunct = "FillTracks"
				Sartistnode.UseScript = Script.ScriptPath
				Tree.AddNode Artistnode, Sartistnode, 3
			End If
		Next
	Next
End Sub

Sub FillAlbums(Subnode)
	' OnFillChildren handler of the "Albums" node.
	' Compares every pair of albums of the selected songs with
	' damerau_levenshtein. For each album that has at least one similar
	' partner it adds a group node below Subnode; the group holds one leaf for
	' the album itself and one leaf per similar album. Leaves list their
	' tracks through FillAlbumTracks and accept drops through AlbumTrackDragDrop.
	'
	' Disc-set markers (" - CD" + number, " - Vol." + number) are removed
	' from both names before comparing, so "xxxx - CD1" and "xxxx - CD2" are
	' not reported as typos of each other. Only the comparison uses the
	' stripped names; node captions always carry the real album names,
	' because FillAlbumTracks queries the database by caption.
	'
	' Subnode - the node whose children are to be created.

	Dim list, albumlist, i, j, dd, a1, a2, c1, c2, s1, s2, marker, isnode, limit
	Dim Progress

	Set Tree = SDB.MainTree
	'Set list = SDB.CurrentSongList
	'Set list = SDB.AllVisibleSongList
	Set list = SDB.SelectedSongList
	Set albumlist = list.Albums
	Set Progress = SDB.Progress
	Progress.Text = SDB.Localize("Find similar Albums ...")
	Progress.MaxValue = albumlist.count

	' Cap handed to the distance function
	limit = AlbumMax + 1

	For i = 0 To albumlist.count - 1

		Progress.Value = i + 1
		If Progress.Terminate Then
			Exit For
		End If

		a1 = albumlist.item(i).Name
		isnode = False

		For j = i + 1 To albumlist.count - 1
			a2 = albumlist.item(j).Name

			' Work on copies so a1 stays intact for the next j and for captions
			c1 = a1
			c2 = a2
			For Each marker In Array(" - CD", " - Vol.")
				s1 = TypoStripSetNumber(c1, marker)
				s2 = TypoStripSetNumber(c2, marker)
				' Strip only when both names carry the numbered marker
				If s1 <> c1 And s2 <> c2 Then
					c1 = s1
					c2 = s2
				End If
			Next

			' Default: "not similar"
			dd = limit

			' Names that are equal after stripping are discs of the same set
			If c1 <> c2 Then
				' damerau_levenshtein returns the limit straight away when the
				' lengths differ by limit or more. Testing that here avoids
				' clearing the whole memo matrix for pairs that cannot match.
				If Abs(Len(c1) - Len(c2)) < limit Then
					' The memo matrix is indexed by string length; longer names
					' would raise "Subscript out of range", so they are skipped.
					If Len(c1) <= UBound(d, 1) And Len(c2) <= UBound(d, 2) Then
						Erase d
						dd = damerau_levenshtein(c1, c2, limit, d)
					End If
				End If
			End If

			If dd < AlbumMax Then
				If Not (isnode) Then
					' First hit for a1: create the group node and a leaf for a1 itself
					isnode = True
					Set Albumnode = Tree.CreateNode
					Albumnode.Caption = a1
					Tree.AddNode Subnode, Albumnode, 3
					Albumnode.HasChildren = True
					Set Salbumnode = Tree.CreateNode
					Salbumnode.Caption = a1
					Salbumnode.OnDragDrop = "AlbumTrackDragDrop"
					Salbumnode.OnFillTracksFunct = "FillAlbumTracks"
					Salbumnode.UseScript = Script.ScriptPath
					Tree.AddNode Albumnode, Salbumnode, 3
				End If

				' Leaf for the similar album
				Set Salbumnode = Tree.CreateNode
				Salbumnode.Caption = a2
				Salbumnode.OnDragDrop = "AlbumTrackDragDrop"
				Salbumnode.OnFillTracksFunct = "FillAlbumTracks"
				Salbumnode.UseScript = Script.ScriptPath
				Tree.AddNode Albumnode, Salbumnode, 3
			End If
		Next
	Next
End Sub

' Returns name without the first occurrence of marker and the digits that
' directly follow it; returns name unchanged when marker is missing or is not
' followed by at least one digit.
Private Function TypoStripSetNumber(name, marker)
	Dim pos, cut

	TypoStripSetNumber = name

	pos = InStr(name, marker)
	If pos = 0 Then Exit Function

	cut = pos + Len(marker)
	If Not TypoIsDigit(Mid(name, cut, 1)) Then Exit Function

	' Mid returns "" past the end of the string, which ends the loop
	Do While TypoIsDigit(Mid(name, cut, 1))
		cut = cut + 1
	Loop

	TypoStripSetNumber = Left(name, pos - 1) & Mid(name, cut)
End Function

' True when ch is exactly one of the characters "0".."9".
Private Function TypoIsDigit(ch)
	TypoIsDigit = False
	If Len(ch) = 1 Then
		If InStr("0123456789", ch) > 0 Then TypoIsDigit = True
	End If
End Function
RedX
Posts: 366
Joined: Wed Dec 27, 2006 10:32 am
Location: Germany

Post by RedX »

after you collapse and make a new selection it doesn't update the node contents. You have to restart to be able to analyze a new batch of files :-/
Post Reply