From 17e9cdbdee9b3b3cdccab416004a99547a7196a7 Mon Sep 17 00:00:00 2001
From: Ignacio Serantes <kde@aynoa.net>
Date: Sun, 17 Jun 2012 20:39:36 +0200
Subject: [PATCH 6/8] Fix: non numeric genres in id3 v2 mp3 are ignored.
REVIEW:105242.
---
id3endanalyzer.cpp | 646 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 646 insertions(+)
create mode 100644 id3endanalyzer.cpp
diff --git a/id3endanalyzer.cpp b/id3endanalyzer.cpp
new file mode 100644
index 0000000..677ece0
--- /dev/null
+++ b/id3endanalyzer.cpp
@@ -0,0 +1,646 @@
+/* This file is part of Strigi Desktop Search
+ *
+ * Copyright (C) 2006 Jos van den Oever <jos@vandenoever.info>
+ * 2009 Evgeny Egorochkin <phreedom.stdin@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "id3endanalyzer.h"
+#include "analysisresult.h"
+#include "../rdfnamespaces.h"
+#include <strigi/strigiconfig.h>
+#include <strigi/textutils.h>
+#include <strigi/stringstream.h>
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include <cstdlib>
+#include <iconv.h>
+
+#ifdef ICONV_SECOND_ARGUMENT_IS_CONST
+ #define ICONV_CONST const
+#else
+ #define ICONV_CONST
+#endif
+
+using namespace Strigi;
+using namespace std;
+// URIs of the RDF properties and classes used below; each is a namespace macro followed by a local name.
+const string
+ typePropertyName(
+ RDF "type"),
+ fullnamePropertyName(
+ NCO "fullname"),
+ titlePropertyName(
+ NIE "title"),
+ albumTrackCountPropertyName(
+ NMM_DRAFT "albumTrackCount"),
+ discNumberPropertyName(
+ NMM_DRAFT "setNumber"),
+ discCountPropertyName(
+ NMM_DRAFT "setSize"),
+ // Class URIs: analyze() uses them as type-field values or as objects of typePropertyName triples.
+ musicClassName(
+ NMM_DRAFT "MusicPiece"),
+ audioClassName(
+ NFO "Audio"),
+ albumClassName(
+ NMM_DRAFT "MusicAlbum"),
+ contactClassName(
+ NCO "Contact");
+
+/*
+ENCA autodetection of broken encodings. First, need to make sure it's going to be actually useful.
+ID3v2.0
+play counter:needs nepomuk resolution
+replaygain
++lyrics
++Improve:
+ creation date:
+ language: support multiple
+ Genre
+ album art type handling
+VBR detection
+*/
+// ID3v1 genre names indexed by genre byte: 0-79 are the original list, 80-147 the Winamp extensions.
+static const string genres[148] = {
+    "Blues",                    // 0
+    "Classic Rock",
+    "Country",
+    "Dance",
+    "Disco",
+    "Funk",
+    "Grunge",
+    "Hip-Hop",
+    "Jazz",
+    "Metal",
+    "New Age",
+    "Oldies",
+    "Other",
+    "Pop",
+    "R&B",
+    "Rap",
+    "Reggae",
+    "Rock",
+    "Techno",
+    "Industrial",
+    "Alternative",
+    "Ska",
+    "Death Metal",
+    "Pranks",
+    "Soundtrack",
+    "Euro-Techno",
+    "Ambient",
+    "Trip-Hop",
+    "Vocal",
+    "Jazz+Funk",
+    "Fusion",                   // 30
+    "Trance",
+    "Classical",
+    "Instrumental",
+    "Acid",
+    "House",
+    "Game",
+    "Sound Clip",
+    "Gospel",
+    "Noise",
+    "Alternative Rock",
+    "Bass",
+    "Soul",
+    "Punk",
+    "Space",
+    "Meditative",
+    "Instrumental Pop",
+    "Instrumental Rock",
+    "Ethnic",
+    "Gothic",
+    "Darkwave",
+    "Techno-Industrial",
+    "Electronic",
+    "Pop-Folk",
+    "Eurodance",
+    "Dream",
+    "Southern Rock",
+    "Comedy",
+    "Cult",
+    "Gangsta",
+    "Top 40",
+    "Christian Rap",
+    "Pop/Funk",
+    "Jungle",
+    "Native American",
+    "Cabaret",
+    "New Wave",
+    "Psychedelic",
+    "Rave",
+    "Showtunes",
+    "Trailer",
+    "Lo-Fi",
+    "Tribal",
+    "Acid Punk",
+    "Acid Jazz",
+    "Polka",
+    "Retro",
+    "Musical",
+    "Rock & Roll",
+    "Hard Rock",
+    "Folk",                     // 80
+    "Folk/Rock",
+    "National Folk",
+    "Swing",
+    "Fast Fusion",              // 84: distinct from "Fusion" at index 30
+    "Bebop",
+    "Latin",
+    "Revival",
+    "Celtic",
+    "Bluegrass",
+    "Avantgarde",
+    "Gothic Rock",
+    "Progressive Rock",
+    "Psychedelic Rock",
+    "Symphonic Rock",
+    "Slow Rock",
+    "Big Band",
+    "Chorus",
+    "Easy Listening",
+    "Acoustic",
+    "Humour",
+    "Speech",
+    "Chanson",
+    "Opera",
+    "Chamber Music",
+    "Sonata",
+    "Symphony",
+    "Booty Bass",
+    "Primus",
+    "Porn Groove",
+    "Satire",
+    "Slow Jam",
+    "Club",
+    "Tango",
+    "Samba",
+    "Folklore",
+    "Ballad",
+    "Power Ballad",
+    "Rhythmic Soul",
+    "Freestyle",
+    "Duet",
+    "Punk Rock",
+    "Drum Solo",
+    "A Cappella",
+    "Euro-House",
+    "Dance Hall",
+    "Goa",
+    "Drum & Bass",
+    "Club-House",
+    "Hardcore",
+    "Terror",
+    "Indie",
+    "BritPop",
+    "Negerpunk",
+    "Polsk Punk",
+    "Beat",
+    "Christian Gangsta Rap",
+    "Heavy Metal",
+    "Black Metal",
+    "Crossover",
+    "Contemporary Christian",
+    "Christian Rock",
+    "Merengue",
+    "Salsa",
+    "Thrash Metal",
+    "Anime",
+    "Jpop",
+    "Synthpop"                  // 147
+};
+// Bitrates (presumably bit/s) indexed by the upper 4 bits of the third MP3 frame-header byte; 0xf is rejected by callers.
+const uint32_t bitrate [15] = {0, 32000, 40000, 48000, 56000, 64000, 80000, 96000, 112000, 128000, 160000, 192000, 224000, 256000, 320000 };
+// Sample rates (presumably Hz) indexed by bits 2-3 of the third MP3 frame-header byte; index 3 is rejected by callers.
+const uint32_t samplerate[3] = {44100, 48000, 32000};
+// iconv source-encoding names indexed by the encoding byte that starts an ID3v2 frame body (see analyze()).
+const char * encodings[5] = {"ISO-8859-1", "UTF-16", "UTF-16BE", "UTF-8", "UTF-16LE"};
+// Fallback for C libraries without strnlen(): length of s, looking at no more than maxlen bytes.
+#ifndef _GNU_SOURCE
+size_t
+strnlen(const char *s, size_t maxlen) {
+    size_t count = 0;
+    while (count < maxlen && s[count] != '\0')
+        ++count;
+    return count;
+}
+#endif
+// Converts text from one fixed source encoding to UTF-8 with iconv, reusing a scratch buffer.
+class UTF8Convertor {
+    private:
+        iconv_t const conv;   // (iconv_t)-1 when iconv_open() rejected the encoding
+        char *out;            // malloc'ed scratch buffer of `capacity` bytes, or 0
+        size_t capacity;
+    public:
+        UTF8Convertor(const char *encoding);
+        const string convert(const char *data, size_t len);
+        ~UTF8Convertor();
+};
+UTF8Convertor::UTF8Convertor(const char *encoding) :conv(iconv_open("UTF-8", encoding)), out(0), capacity(0) {
+}
+UTF8Convertor::~UTF8Convertor() {
+    if (conv != (iconv_t)-1) iconv_close(conv);   // never close an invalid descriptor
+    free(out);                                    // free(0) is a no-op
+}
+// Returns data[0..len) converted to UTF-8. The result is empty when len is 0, the encoding
+// is unsupported or memory runs out; bytes converted before an iconv error are kept.
+const string
+UTF8Convertor::convert(const char *data, size_t len) {
+    if (!len || conv == (iconv_t)-1)
+        return string();
+    if ( capacity<len*3 ||    // is the buffer too small or too large?
+         (capacity>10000 && capacity>len*8) ) {
+        char *resized = (char*)realloc(out, len*3);
+        if (!resized) return string();   // out still owns the old buffer, capacity still matches it
+        out = resized; capacity = len*3;
+    }
+    char *result = out;
+    size_t reslen = capacity;
+    ICONV_CONST char *input = (char *)data;
+    iconv(conv, &input, &len, &result, &reslen);
+    return string(out,capacity-reslen);
+}
+// Registers with r every field written by ID3EndAnalyzer::analyze() and keeps the returned handles.
+void
+ID3EndAnalyzerFactory::registerFields(FieldRegister& r) {
+ createdField = r.registerField(NIE "contentCreated");
+ subjectField = r.registerField(NIE "subject");
+ titleField = r.registerField(titlePropertyName);
+ descriptionField = r.registerField(NIE "description");
+ commentField = r.registerField(NIE "comment");
+ albumField = r.registerField(NMM_DRAFT "musicAlbum");
+ genreField = r.registerField(NMM_DRAFT "genre");
+ composerField = r.registerField(NMM_DRAFT "composer");
+ performerField = r.registerField(NMM_DRAFT "performer");
+ lyricistField = r.registerField(NMM_DRAFT "lyricist");
+ publisherField = r.registerField(NCO "publisher");
+ languageField = r.registerField(NIE "language");
+ copyrightField = r.registerField(NIE "copyright");
+ trackNumberField = r.registerField(NMM_DRAFT "trackNumber");
+ discNumberField = r.registerField(discNumberPropertyName);
+ durationField = r.registerField(NFO "duration");
+ typeField = r.typeField;
+ // Fields that analyze() fills from the MP3 frame header rather than from tag frames.
+ bitrateField = r.registerField(NFO "averageBitrate");
+ samplerateField = r.registerField(NFO "sampleRate");
+ codecField = r.registerField(NFO "codec");
+ channelsField = r.registerField(NFO "channels");
+}
+// Adds the triple (subject, predicate, object); an empty subject is first replaced by a new anonymous URI.
+inline void
+addStatement(AnalysisResult &indexable, string& subject, const string& predicate, const string& object) {
+    const bool needsNode = subject.empty();
+    if (needsNode)
+        subject = indexable.newAnonymousUri();
+    indexable.addTriplet(subject, predicate, object);
+}
+// Combines the four bytes at b, most significant first, giving each a weight of 7 bits (ID3v2 "syncsafe" size).
+inline int32_t readAsyncSize(const unsigned char* b) {
+    int32_t total = 0;
+    for (int k = 0; k < 4; ++k) total = (total << 7) + (int32_t)b[k];
+    return total;
+}
+// Reads the 4-byte size at b: syncsafe when async is set (-1 if any byte has its
+// top bit set), plain big-endian otherwise.
+int32_t
+readSize(const unsigned char* b, bool async) {
+    if (!async)
+        return readBigEndianInt32(b);
+    const bool topBitSet = ((b[0] | b[1] | b[2] | b[3]) & 0x80) != 0;
+    if (topBitSet)
+        return -1;
+    return readAsyncSize(b);
+}
+// Returns true when header, after optional leading zero bytes, starts with an ID3v2
+// tag header (version <= 4, no flag but 0x80) or with what looks like an MP3 frame header.
+bool
+ID3EndAnalyzer::checkHeader(const char* header, int32_t headersize) const {
+    const unsigned char* usbuf = (const unsigned char*)header;
+    int32_t i = 0;
+    // Skip zero padding; the bound is tested first so header[headersize] is never read.
+    while (i < headersize && header[i] == '\0') i++;
+    if (headersize < 6+i) return false;
+    usbuf += i;
+    const bool id3 = strncmp("ID3", header+i, 3) == 0   // check that it's ID3
+        && usbuf[3] <= 4                                // only handle version <= 4
+        && (usbuf[5]&~0x80) == 0;                       // we're too dumb to handle other flags
+    const bool mp3 = usbuf[0] == 0xff && (usbuf[1]&0xfe) == 0xfa
+        && usbuf[2]>>4 != 0xf                           // MP3 frame header is ok too
+        && ((usbuf[2]>>2)&3) != 3;
+    return id3 || mp3;
+}
+// Removes, in place, every leading and trailing character of s that occurs in drop.
+static void trim(string& s,const string& drop = " ")
+{
+    s.erase(s.find_last_not_of(drop)+1);    // npos+1 wraps to 0, so an all-drop string becomes empty
+    s.erase(0, s.find_first_not_of(drop));  // edit s itself, not a copy, so the left trim takes effect
+}
+// Copies the fixed-width field buf[offset, offset+length) into s, cut at the first NUL,
+// and passes it through trim(). Returns true only when some text is left in s.
+static bool extract_and_trim(const char* buf, int offset, int length, string& s)
+{
+    const char* field = buf + offset;
+    // A field that starts with NUL is treated as absent; s is left untouched.
+    if (*field == '\0')
+        return false;
+
+    // strnlen() keeps the copy inside the field when it has no terminator.
+    s.assign(field, strnlen(field, length));
+    trim(s);
+    return s.length() != 0;
+}
+// Extracts ID3v2 frames, the first MP3 frame header and a trailing ID3v1 tag from `in` and adds
+// them to `indexable`. Returns -1 for a null stream or an oversized/truncated ID3v2 tag, else 0.
+signed char ID3EndAnalyzer::analyze(Strigi::AnalysisResult& indexable, Strigi::InputStream* in) {
+    const int max_padding = 1000;
+    if(!in)
+        return -1;
+
+    bool found_title = false, found_artist = false,
+         found_album = false, found_comment = false,
+         found_year = false, found_track = false,
+         found_genre = false, found_tag = false;
+    string albumUri;
+    char albumArtNum = '\0';
+
+    // read the 10 byte ID3v2 header plus the window in which a padded MP3 frame may start
+    const char* buf;
+    int32_t nread = in->read(buf, 10+max_padding, 10+max_padding);
+    int32_t avail = nread; // bytes valid at buf when the MP3 frame header is probed (short or failed reads!)
+    // parse ID3v2* tag
+
+    if (nread == 10+max_padding && strncmp("ID3", buf, 3) == 0) { // check for ID3 header
+
+        bool async = buf[3] >= 4;
+        bool unsync = (buf[5] & 0x80)!=0;
+
+        // calculate size from 4 syncsafe bytes
+        int32_t size = readAsyncSize((unsigned char*)buf+6);
+        if (size < 0 || size > 5000000)
+            return -1;
+        size += 10+4+max_padding; // add the size of the ID3 header, MP3 frame header and possible padding generated by LAME(workaround)
+
+        // read the entire tag
+        in->reset(0);
+        nread = in->read(buf, size, size);
+        if (nread != size)
+            return -1;
+
+        found_tag = true;
+        avail = 4+max_padding; // bytes left at buf once it is advanced past the tag below
+        const char* p = buf + 10;
+        buf += size-4-max_padding;
+        while (p < buf && *p) {
+            size = readSize((unsigned char*)p+4, async);
+            if (size <= 0 || size > (buf-p)-10) {
+                //cerr << "size < 0: " << size << endl;
+                break;
+            }
+
+            string value;
+            uint8_t enc = p[10];
+            const char *encoding = enc>4 ? encodings[0] : encodings[enc] ;
+            UTF8Convertor conv(encoding);
+            const char *decoded_value;
+            int32_t decoded_value_size;
+            string deunsyncbuf;
+            if (unsync) {
+                deunsyncbuf.reserve(size-1);
+                for(int32_t i = 0; i<size-1; i++)
+                    if ( (i==0) || (p[11+i]!=0) || ((unsigned char)p[10+i]!=0xff) ) // skip a 0x00 that follows 0xff; compare unsigned, plain char may be signed
+                        deunsyncbuf.push_back(p[11+i]);
+                decoded_value = deunsyncbuf.c_str();
+                decoded_value_size = deunsyncbuf.length();
+            } else {
+                decoded_value = p+11;
+                decoded_value_size = size-1;
+            };
+
+            if (strncmp("APIC", p, 4) == 0) {
+                size_t mimelen = strnlen(decoded_value, decoded_value_size);
+                if ((int32_t)mimelen < decoded_value_size-3) {
+                    const char *desc = decoded_value+mimelen+1+1;
+//                  uint8_t pictype = p[11+mimelen+1];
+                    size_t desclen = strnlen(desc,decoded_value_size-mimelen-2-1);
+                    const char *content = desc + desclen + 1 + (enc == 0 || enc == 3 ? 0:1) ;
+
+                    if(content<decoded_value+decoded_value_size) {
+                        StringInputStream picstream(content,
+                            (uint32_t)(decoded_value+decoded_value_size-content), false);
+                        string picname;
+                        picname = (char)('0'+albumArtNum++);
+                        indexable.indexChild(picname, indexable.mTime(), &picstream);
+
+                        if (desclen && indexable.child()) {
+                            if (enc == 0 || enc == 3) {
+                                indexable.child()->addValue(factory->descriptionField, string(desc, desclen) );
+                            } else {
+                                indexable.child()->addValue(factory->descriptionField, conv.convert(desc, desclen) );
+                            }
+                        }
+
+                        indexable.finishIndexChild();
+                    }
+                }
+            }
+
+            if (enc == 0 || enc == 3) {
+                value = string(decoded_value, strnlen(decoded_value, decoded_value_size));
+            } else {
+                value = conv.convert(decoded_value, decoded_value_size); // FIXME: add similar workaround
+            }
+
+            if (!value.empty()) {
+                if (strncmp("TIT1", p, 4) == 0) {
+                    indexable.addValue(factory->subjectField, value);
+                } else if (strncmp("TIT2", p, 4) == 0) {
+                    indexable.addValue(factory->titleField, value);
+                    found_title = true;
+                } else if (strncmp("TIT3", p, 4) == 0) {
+                    indexable.addValue(factory->descriptionField, value);
+                } else if (strncmp("TLAN", p, 4) == 0) {
+                    indexable.addValue(factory->languageField, value);
+                } else if (strncmp("TCOP", p, 4) == 0) {
+                    indexable.addValue(factory->copyrightField, value);
+                } else if ((strncmp("TDRL", p, 4) == 0) ||
+                           (strncmp("TDAT", p, 4) == 0) ||
+                           (strncmp("TYER", p, 4) == 0) ||
+                           (strncmp("TDRC", p, 4) == 0)) {
+                    indexable.addValue(factory->createdField, value);
+                    found_year = true;
+                } else if ((strncmp("TPE1", p, 4) == 0) ||
+                           (strncmp("TPE2", p, 4) == 0) ||
+                           (strncmp("TPE3", p, 4) == 0) ||
+                           (strncmp("TPE4", p, 4) == 0)) {
+                    string performerUri = indexable.newAnonymousUri();
+
+                    indexable.addValue(factory->performerField, performerUri);
+                    indexable.addTriplet(performerUri, typePropertyName, contactClassName);
+                    indexable.addTriplet(performerUri, fullnamePropertyName, value);
+                    found_artist = true;
+                } else if ((strncmp("TPUB", p, 4) == 0) ||
+                           (strncmp("TENC", p, 4) == 0)) {
+                    string publisherUri = indexable.newAnonymousUri();
+
+                    indexable.addValue(factory->publisherField, publisherUri);
+                    indexable.addTriplet(publisherUri, typePropertyName, contactClassName);
+                    indexable.addTriplet(publisherUri, fullnamePropertyName, value);
+                } else if ((strncmp("TALB", p, 4) == 0) ||
+                           (strncmp("TOAL", p, 4) == 0)) {
+                    addStatement(indexable, albumUri, titlePropertyName, value);
+                    found_album = true;
+                } else if (strncmp("TCON", p, 4) == 0) {
+                    // The genre is either "(number)", an index into genres[], or free text.
+                    const char *genreText = value.c_str();
+                    char *numEnd = 0;
+                    const long genreIndex = genreText[0] == '(' ? strtol(genreText+1, &numEnd, 10) : -1;
+                    if (numEnd && numEnd != genreText+1 && numEnd == genreText+value.length()-1 && *numEnd == ')'
+                            && genreIndex >= 0 && genreIndex < (long)(sizeof(genres)/sizeof(genres[0]))) {
+                        indexable.addValue(factory->genreField, genres[ genreIndex ]);
+                    } else {
+                        // Free text, or a reference we cannot resolve such as "(RX)" or "(200)":
+                        // index the raw string instead of reading outside genres[].
+                        indexable.addValue(factory->genreField, value);
+                    }
+                    found_genre = true;
+                } else if (strncmp("TLEN", p, 4) == 0) {
+                    indexable.addValue(factory->durationField, value);
+                } else if (strncmp("TEXT", p, 4) == 0) {
+                    string lyricistUri = indexable.newAnonymousUri();
+
+                    indexable.addValue(factory->lyricistField, lyricistUri);
+                    indexable.addTriplet(lyricistUri, typePropertyName, contactClassName);
+                    indexable.addTriplet(lyricistUri, fullnamePropertyName, value);
+                } else if (strncmp("TCOM", p, 4) == 0) {
+                    string composerUri = indexable.newAnonymousUri();
+
+                    indexable.addValue(factory->composerField, composerUri);
+                    indexable.addTriplet(composerUri, typePropertyName, contactClassName);
+                    indexable.addTriplet(composerUri, fullnamePropertyName, value);
+                } else if (strncmp("TRCK", p, 4) == 0) {
+                    istringstream ins(value);
+                    int tnum;
+                    ins >> tnum;
+                    if (!ins.fail()) {
+                        indexable.addValue(factory->trackNumberField, tnum);
+                        found_track = true;
+                        ins.ignore(10,'/');
+                        int tcount;
+                        ins >> tcount;
+                        if (!ins.fail()) {
+                            ostringstream outs;
+                            outs << tcount;
+                            addStatement(indexable, albumUri, albumTrackCountPropertyName, outs.str());
+                        }
+                    }
+                } else if (strncmp("TPOS", p, 4) == 0) {
+                    istringstream ins(value);
+                    int dnum;
+                    ins >> dnum;
+                    if (!ins.fail()) {
+                        indexable.addValue(factory->discNumberField, dnum);
+                        ins.ignore(10,'/');
+                        int dcount;
+                        ins >> dcount;
+                        if (!ins.fail()) {
+                            ostringstream outs;
+                            outs << dcount;
+                            addStatement(indexable, albumUri, discCountPropertyName, outs.str());
+                        }
+                    }
+                }
+            }
+            p += size + 10;
+        }
+    }
+    // parse MP3 frame header
+
+    int bitrateindex, samplerateindex;
+    int i;
+    for(i=0; (i+4<=avail) && (i<max_padding) && (buf[i]=='\0'); i++);
+    if ((i+4<=avail) && ((unsigned char)buf[0+i] == 0xff) && (((unsigned char)buf[1+i]&0xfe) == 0xfa)
+        && ((bitrateindex = ((unsigned char)buf[2+i]>>4)) != 0xf)
+        && ((samplerateindex = (((unsigned char)buf[2+i]>>2)&3)) != 3 )) { // is this MP3?
+
+        indexable.addValue(factory->typeField, audioClassName);
+        // FIXME: no support for VBR :(
+        // ideas: compare bitrate from the frame with stream size/duration from ID3 tags
+        // check several consecutive frames to see if bitrate is different
+        // in neither case you can be sure to properly detected VBR :(
+        indexable.addValue(factory->bitrateField, bitrate[bitrateindex]);
+        indexable.addValue(factory->samplerateField, samplerate[samplerateindex]);
+        indexable.addValue(factory->codecField, "MP3");
+        indexable.addValue(factory->channelsField, (((unsigned char)buf[3+i]>>6) == 3 ? 1:2 ) ); // unsigned: a signed shift never yields 3
+    }
+
+    // Parse ID3v1 tag
+
+    int64_t insize;
+    if ( (insize = in->size()) > (128+nread)) {
+
+        // read the tag and check signature
+        int64_t nskip = insize-128-nread;
+        if (nskip == in->skip(nskip))
+        if (in->read(buf, 128, 128)==128)
+        if (!strncmp("TAG", buf, 3)) {
+
+            found_tag = true;
+
+            std::string s;
+
+            if (!found_title && extract_and_trim(buf, 3, 30, s)) {
+                indexable.addValue(factory->titleField, s);
+            }
+            if (!found_artist && extract_and_trim(buf, 33, 30, s)) {
+                const string performerUri = indexable.newAnonymousUri();
+                indexable.addValue(factory->performerField, performerUri);
+                indexable.addTriplet(performerUri, typePropertyName, contactClassName);
+                indexable.addTriplet(performerUri, fullnamePropertyName, s);
+            }
+            if (!found_album && extract_and_trim(buf, 63, 30, s))
+                addStatement(indexable, albumUri, titlePropertyName, s);
+            if (!found_year && extract_and_trim(buf, 93, 4, s))
+                indexable.addValue(factory->createdField, s);
+            if (!found_comment && extract_and_trim(buf, 97, 30, s)) {
+                indexable.addValue(factory->commentField, s);
+            }
+            if (!found_track && !buf[125] && buf[126]) {
+                indexable.addValue(factory->trackNumberField, (int)(unsigned char)(buf[126])); // track byte is unsigned (1..255)
+            }
+            if (!found_genre && (unsigned char)(buf[127]) < 148)
+                indexable.addValue(factory->genreField, genres[(uint8_t)buf[127]]);
+        }
+    }
+
+    if(!albumUri.empty()) {
+        indexable.addValue(factory->albumField, albumUri);
+        indexable.addTriplet(albumUri, typePropertyName, albumClassName);
+    }
+
+    if (found_tag)
+        indexable.addValue(factory->typeField, musicClassName);
+
+    return 0;
+}
--
1.7.10.4