Javaによるテキストクラスタリングの実装

carrot2は中国語テキストの理解が不十分であるため、ネット上の資料を参考にして、以下のコードを作成しました。

この実装では、文字や語の出現頻度を計算し、スコアを付与して、最も重要な語彙を抽出する方法を取りました。以下は実行可能なコードです。

ClusterBuilder.java

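/**
 * Cluster builder: groups documents into clusters (optionally with one level of
 * sub-clusters) based on the tag lists attached to each document.
 *
 * @version created 2011-3-8 14:02:36
 */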
public class ClusterBuilder {
    private static final Log LOG;
    private List<DocCluster> clusters;
    private ICTHit[] docs;
    private int maxLevels;
    private ClusteringOptions[] options;
    private boolean useTagsAsTitle;
    private String wordsExcluded;
    private static short[] bit1Table;
 
    // Class initialization: creates the logger and fills the 16-bit population-count
    // lookup table (bit1Table[n] == number of 1 bits in n, for n in 0..65535).
    static {
        LOG = LogFactory.getLog(ClusterBuilder.class.getName());

        bit1Table = new short[65536];

        for (int n = 0; n < bit1Table.length; n++) {
            // Integer.bitCount replaces the former "format as binary string and count
            // the '1' characters" loop; the stored values are identical.
            bit1Table[n] = (short) Integer.bitCount(n);
        }
    }
 
    /**
     * Counts the 1 bits of a 64-bit hit word.
     *
     * <p>The previous implementation split the value into four 16-bit chunks with
     * {@code %} and {@code /}. For a value with bit 63 set those operators produce
     * negative chunks, which were then used as array indexes and caused an
     * {@code ArrayIndexOutOfBoundsException}. {@link Long#bitCount(long)} returns the
     * same result for every non-negative value and is also correct for negative ones.
     *
     * @param n the hit word to inspect
     * @return the number of bits set in {@code n}
     */
    private static int getValidBitCount(long n) {
        return Long.bitCount(n);
    }
 
    /**
     * Sums the set bits over all words of a hit array.
     *
     * @param hits hit words of one tag; {@code null} trips the assertion when assertions
     *             are enabled and otherwise counts as zero hits
     * @return total number of set bits in {@code hits}
     */
    private static int getDocHitCount(long[] hits) {
        assert hits != null;
        int total = 0;
        if (hits != null) {
            for (long word : hits) {
                total += getValidBitCount(word);
            }
        }
        return total;
    }
 
    public ClusterBuilder() {
        for (int n = 0; n < bit1Table.length; n++)
        {
            String s = Integer.toBinaryString(n);
            short m = 0;
            for (int k = 0; k < s.length(); k++)
            {
                if (s.getBytes()[k] == '1')
                {
                    m = (short)(m + 1);
                }
            }
            bit1Table[n] = m;
        }
    }
    /**
     * 
     * @param docsToCluster クラスタリング対象のドキュメントリスト
     * @param exWords 使用しないタグリスト。複数のタグは半角コンマで区切る。これらのタグはタグとして使用されない。
     * @param maxLevels 最大クラスターレベル
     * @param useTagsAsTitle タグをカテゴリタイトルとして使用するかどうか。使用しない場合はドキュメントタイトルから自動的に生成される。
     */
    public ClusterBuilder(ICTHit[] docsToCluster, String exWords, int maxLevels, boolean useTagsAsTitle) {
        this.useTagsAsTitle = useTagsAsTitle;
        this.wordsExcluded = exWords;
        this.maxLevels = maxLevels;
        this.docs = docsToCluster;
        this.options = new ClusteringOptions[3];
        this.options[0] = new ClusteringOptions();
        this.options[0].setDocMaxTagCount(10);
        this.options[0].setMinTagRelevance(60);
        this.options[0].setMinSameDocPercent(80);
 
        this.options[1] = new ClusteringOptions();
        this.options[1].setDocMaxTagCount(8);
        this.options[1].setMinTagRelevance(85);
        this.options[1].setMinSameDocPercent(70);
        this.options[1].setTagMinDocCount(2);
        this.options[1].setMinSameDocs(2);
 
        this.options[2] = new ClusteringOptions();
        this.options[2].setDocMaxTagCount(8);
        this.options[2].setMinTagRelevance(50);
        this.options[2].setMinSameDocPercent(70);
        this.options[2].setTagMinDocCount(2);
        this.options[2].setMinSameDocs(2);
    }
    /**
     * Clusters the configured documents and stores the result in {@code clusters}.
     *
     * <p>The top level is always built with {@code options[0]}. When {@code maxLevels}
     * is greater than 1, every top-level cluster that holds at least
     * {@code options[0].getMinDocsToCluster()} documents and is not the catch-all
     * cluster tagged "其他" is clustered once more with {@code options[1]}; the
     * sub-clusters are attached only when more than one was produced.
     */
    public void cluster() {
        this.clusters = createLevelClusters(docs, 0, options[0]);
        if (this.maxLevels <= 1) {
            return;
        }
        for (DocCluster dc : this.clusters) {
            // Compare the tag by value; the former "==" only worked while both sides
            // happened to be the same interned literal.
            if ((dc.getDocList().length < options[0].getMinDocsToCluster()) || "其他".equals(dc.getTags())) {
                continue;
            }
            List<DocCluster> subs = createLevelClusters(dc.getDocList(), 1, options[1]);
            if (subs.size() > 1) {
                dc.setSubclusters(subs);
            }
        }
    }
    /**
     * Builds the clusters of one level.
     *
     * <p>Steps: fill a tag/document hit matrix from the documents' tag lists, drop tags
     * that hit fewer documents than {@code levelOpt.getTagMinDocCount()}, derive
     * {@code minSameDocs} from the average number of documents per remaining tag, merge
     * related tags and finally turn the matrix into clusters.
     *
     * <p>Side effect: {@code levelOpt.setMinSameDocs(...)} is called, so the passed
     * option object is modified.
     *
     * @param docs     documents to cluster
     * @param level    level number (0 for the top level)
     * @param levelOpt clustering options of this level
     * @return the clusters created for this level
     */
    private List<DocCluster> createLevelClusters(ICTHit[] docs, int level, ClusteringOptions levelOpt) {
        TagHitMatrix matrix = new TagHitMatrix(docs.length, levelOpt.getDocMaxTagCount());
        // A missing exclusion string is treated like an empty one instead of failing
        // with a NullPointerException further down.
        String excluded = this.wordsExcluded == null ? "" : this.wordsExcluded;

        // Walk the documents and register every accepted tag in the tag/document matrix.
        for (int i = 0; i < docs.length; i++) {
            String[] tagList = docs[i].getTagList();
            if (tagList == null) {
                continue;
            }
            int validTagCount = 0;
            for (int tagIdx = 0; (tagIdx < tagList.length) && (validTagCount < levelOpt.getDocMaxTagCount()); tagIdx++) {
                String tag = tagList[tagIdx].trim();
                // The exclusion test is a plain substring test in both directions against
                // the whole exclusion string, not a per-word comparison.
                boolean isExcluded = (excluded.length() != 0)
                        && (tag.contains(excluded) || excluded.contains(tag));
                // Skip empty tags, tags longer than 20 characters and excluded tags.
                if ((tag.length() <= 0) || (tag.length() > 20) || isExcluded) {
                    continue;
                }
                matrix.AddDocHit(tag, i);
                validTagCount++;
            }
        }

        int docCount = 0;
        int maxKwDocCount = 0;
        String kwWithMaxDocCount = "";
        List<String> entryListToRemove = new ArrayList<String>();
        LOG.debug("有効なキーワード：");
        for (Map.Entry<String, long[]> entry : matrix.entrySet()) {
            // Count the documents hit by this tag; tags below the configured minimum are dropped.
            int n = getDocHitCount(entry.getValue());
            if (n < levelOpt.getTagMinDocCount()) {
                entryListToRemove.add(entry.getKey());
            } else {
                LOG.debug(entry.getKey() + "(" + n + "), ");
                docCount += n;
            }
            if (n > maxKwDocCount) {
                maxKwDocCount = n;
                kwWithMaxDocCount = entry.getKey();
            }
        }
        LOG.debug("");

        LOG.debug("無視されたキーワード：");
        for (String ignoredTag : entryListToRemove) {
            LOG.debug(ignoredTag + ", ");
            matrix.remove(ignoredTag);
        }
        LOG.debug("");

        LOG.debug(entryListToRemove.size() + "個のキーワードは無視されました。残り" + matrix.size() + "個のキーワード。");
        LOG.debug("最大ドキュメント数のキーワード：" + kwWithMaxDocCount + "、ドキュメント数：" + maxKwDocCount + "。");

        // Floating-point division: the former int / int division truncated the average
        // before it was stored in the double. The truncation to int below yields the same
        // minSameDocs either way, so only the logged average becomes exact.
        double docCountPerTag = matrix.size() > 0 ? (double) docCount / matrix.size() : 0.0D;
        LOG.debug("キーワード平均ドキュメント数：" + docCountPerTag);

        levelOpt.setMinSameDocs((int) (docCountPerTag / (2.0D + level)));
        if (levelOpt.getMinSameDocs() < 1) {
            levelOpt.setMinSameDocs(1);
        }

        // Repeats while mergeClusters reports merges through a positive return value.
        while (mergeClusters(matrix, levelOpt) > 0) {
        }
        return createResult(matrix, docs, level, levelOpt);
    }
 
    /**
     * Merges related tags of the matrix in a single pass.
     *
     * <p>All tag pairs are scored with {@link #getWordsRelevance}; pairs scoring at least
     * {@code opt.getMinTagRelevance()} are placed in a bucket per score (0..100). The
     * best-scoring pair is then merged repeatedly: the two entries are combined into one
     * entry keyed "tag1,tag2", and every remaining pair that involved one of the two is
     * replaced by a freshly scored pair involving the merged entry. Original matrix rows
     * are identified by their index (&gt;= 0), merged entries by the negated index into
     * the list of merged entries (&lt; 0).
     *
     * <p>Fixes compared with the previous version:
     * <ul>
     * <li>the merged hit bits are now written to the merged entry; previously they were
     * OR-ed into the first source array while the merged entry kept an all-zero array;</li>
     * <li>placeholder entries (empty key, {@code null} hits) are no longer put into the
     * matrix.</li>
     * </ul>
     *
     * @param matrix tag/document matrix; merged tags replace their source tags in place
     * @param opt    clustering options of the current level
     * @return always 0, so the caller's {@code while (mergeClusters(...) > 0)} loop runs
     *         this pass exactly once
     */
    private int mergeClusters(TagHitMatrix matrix, ClusteringOptions opt) {
        if (matrix.size() == 0) {
            return 0;
        }
        // getWordsRelevance ignores the incoming values of these two and returns fresh
        // ones; they are only kept to satisfy its signature.
        long[] docHitsMerged = null;
        int sameDocs = 0;
        final int minRank = opt.getMinTagRelevance();

        // rankMatrix.get(r) holds the id pairs whose relevance is r (0..100, 101 buckets).
        List<List<IdPair>> rankMatrix = new ArrayList<List<IdPair>>();
        for (int r = 0; r < 101; r++) {
            rankMatrix.add(new ArrayList<IdPair>());
        }
        List<Map.Entry<String, long[]>> matrix2List =
                new ArrayList<Map.Entry<String, long[]>>(matrix.entrySet());

        // Score every pair of tags; pairs below the threshold are ignored.
        for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {
            TagHitEntry hits1 = mapEntry2TagHitEntry(matrix2List.get(i1));
            for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {
                TagHitEntry hits2 = mapEntry2TagHitEntry(matrix2List.get(i2));
                Object[] re = getWordsRelevance(hits1, hits2, docHitsMerged, sameDocs, opt, matrix.hitsItemCount);
                int nRank = ((Integer) re[0]).intValue();
                docHitsMerged = (long[]) re[1];
                sameDocs = ((Integer) re[2]).intValue();
                if (nRank >= minRank) {
                    rankMatrix.get(nRank).add(new IdPair(i1, i2));
                }
            }
        }

        List<String> tagListToRemove = new ArrayList<String>();
        List<TagHitEntry> entryListMerged = new ArrayList<TagHitEntry>();
        // Slot 0 is a placeholder so that merged entries get the ids -1, -2, ...
        // (id 0 already denotes the first matrix row).
        entryListMerged.add(new TagHitEntry("", null));
        HashSet<IdPair> idPairTable = new HashSet<IdPair>();

        while (true) {
            // Find the highest non-empty relevance bucket.
            int best = 100;
            while ((best >= minRank) && rankMatrix.get(best).isEmpty()) {
                best--;
            }
            if (best < minRank) {
                break;
            }
            IdPair ip = rankMatrix.get(best).remove(0);

            // Resolve both ids to their entries and merge them.
            TagHitEntry entryToMerge1 = ip.Id1 >= 0 ? mapEntry2TagHitEntry(matrix2List.get(ip.Id1))
                    : entryListMerged.get(-ip.Id1);
            TagHitEntry entryToMerge2 = ip.Id2 >= 0 ? mapEntry2TagHitEntry(matrix2List.get(ip.Id2))
                    : entryListMerged.get(-ip.Id2);
            String word1ToMerge = entryToMerge1.key;
            String word2ToMerge = entryToMerge2.key;
            assert ((word1ToMerge.length() > 0) && (word2ToMerge.length() > 0));

            String wordsMerged = word1ToMerge + "," + word2ToMerge;
            long[] lDocs0 = entryToMerge1.value;
            long[] lDocs1 = entryToMerge2.value;
            long[] maxDocHitsMerged = new long[matrix.hitsItemCount];
            for (int k = 0; k < lDocs0.length; k++) {
                // Union of both document sets, stored in the merged entry's own array.
                maxDocHitsMerged[k] = lDocs0[k] | lDocs1[k];
            }

            // Source rows are removed from the matrix at the end; merged source entries
            // are blanked out in the merged list.
            if (ip.Id1 >= 0) {
                tagListToRemove.add(word1ToMerge);
            } else {
                entryListMerged.set(-ip.Id1, new TagHitEntry("", null));
            }
            if (ip.Id2 >= 0) {
                tagListToRemove.add(word2ToMerge);
            } else {
                entryListMerged.set(-ip.Id2, new TagHitEntry("", null));
            }
            entryListMerged.add(new TagHitEntry(wordsMerged, maxDocHitsMerged));
            int idMerged = -(entryListMerged.size() - 1);

            // Replace every pair that involved one of the merged ids by a pair with the
            // merged entry, scored anew.
            for (int r = 0; r <= 100; r++) {
                List<IdPair> bucket = rankMatrix.get(r);
                // The size is captured up front: pairs appended to this bucket during
                // the scan are not visited in this pass.
                int listCount = bucket.size();
                for (int j = 0; j < listCount; j++) {
                    IdPair p = bucket.get(j);
                    int id2;
                    if ((ip.Id1 == p.Id1) || (ip.Id2 == p.Id1)) {
                        id2 = p.Id2;
                    } else if ((ip.Id1 == p.Id2) || (ip.Id2 == p.Id2)) {
                        id2 = p.Id1;
                    } else {
                        continue;
                    }
                    if (idMerged == id2) {
                        continue;
                    }

                    bucket.remove(j);
                    j--;
                    listCount--;

                    IdPair pairMerged = new IdPair(idMerged, id2);
                    if (idPairTable.contains(pairMerged)) {
                        continue;
                    }

                    TagHitEntry e2 = id2 >= 0 ? mapEntry2TagHitEntry(matrix2List.get(id2))
                            : entryListMerged.get(-id2);
                    assert (e2.key.length() != 0);

                    Object[] re = getWordsRelevance(new TagHitEntry(wordsMerged, maxDocHitsMerged), e2, docHitsMerged,
                            sameDocs, opt, matrix.hitsItemCount);
                    int rank = ((Integer) re[0]).intValue();
                    docHitsMerged = (long[]) re[1];
                    sameDocs = ((Integer) re[2]).intValue();

                    // NOTE(review): this uses "<=" while the initial scoring above keeps
                    // pairs with rank == minimum; kept as found, confirm which is intended.
                    if (rank <= minRank) {
                        continue;
                    }
                    rankMatrix.get(rank).add(pairMerged);
                    idPairTable.add(pairMerged);
                }
            }
        }

        // Remove the tags that were merged into others.
        for (String merged : tagListToRemove) {
            matrix.remove(merged);
        }
        // Add the merged tags; blanked-out placeholders (empty key) are skipped.
        for (TagHitEntry e : entryListMerged) {
            if (e.getKey().length() > 0) {
                matrix.put(e.getKey(), e.getValue());
            }
        }
        return 0;
    }
 
    /**
     * Alternative merge step that merges only the single best-scoring tag pair per call.
     * It is not called from anywhere in this class.
     *
     * @param matrix tag/document matrix; on a merge the two tags are replaced by one
     *               entry keyed "tag1,tag2"
     * @param opt    clustering options of the current level
     * @return 1 when a pair was merged, 0 otherwise
     */
    private int mergeClusters1(TagHitMatrix matrix, ClusteringOptions opt) {
        if (matrix.size() == 0) {
            return 0;
        }
        long[] docHitsMerged = null;
        long[] maxDocHitsMerged = null;
        int nMaxRank = 0;
        String word1ToMerge = "";
        String word2ToMerge = "";
        int sameDocs = 0;

        List<Map.Entry<String, long[]>> matrix2List =
                new ArrayList<Map.Entry<String, long[]>>(matrix.entrySet());

        // Look for the pair with the highest relevance above the threshold.
        for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {
            TagHitEntry hits1 = mapEntry2TagHitEntry(matrix2List.get(i1));
            for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {
                TagHitEntry hits2 = mapEntry2TagHitEntry(matrix2List.get(i2));
                Object[] re = getWordsRelevance(hits1, hits2, docHitsMerged, sameDocs, opt, matrix.hitsItemCount);
                int nRank = ((Integer) re[0]).intValue();
                docHitsMerged = (long[]) re[1];
                sameDocs = ((Integer) re[2]).intValue();

                if ((nRank <= nMaxRank) || (nRank <= opt.getMinTagRelevance())) {
                    continue;
                }
                nMaxRank = nRank;
                maxDocHitsMerged = docHitsMerged;
                word1ToMerge = hits1.getKey();
                word2ToMerge = hits2.getKey();
            }
        }

        if ((word1ToMerge.length() == 0) || (word2ToMerge.length() == 0)) {
            return 0;
        }

        // The former extra condition 'wordsMerged != ""' compared references and was
        // always true (the merged key always contains the separator), so it is dropped.
        String wordsMerged = word1ToMerge + "," + word2ToMerge;
        if (nMaxRank > opt.getMinTagRelevance()) {
            matrix.remove(word1ToMerge);
            matrix.remove(word2ToMerge);
            matrix.put(wordsMerged, maxDocHitsMerged);
            LOG.debug("(" + word1ToMerge + ") - (" + word2ToMerge + ")");

            return 1;
        }

        return 0;
    }
 
    /**
     * Scores how closely two tags are related, based on the documents they hit.
     *
     * <p>The incoming values of {@code docHitsMerged} and {@code sameDocCount} are
     * ignored; both are recomputed and handed back through the returned array.
     *
     * @param entry1        first tag with its hit words
     * @param entry2        second tag with its hit words
     * @param docHitsMerged ignored on input
     * @param sameDocCount  ignored on input
     * @param opt           thresholds (tag minimum document count, minimum shared
     *                      documents, minimum shared percentage)
     * @param hitsItemCount number of hit words per tag
     * @return a three-element array: [0] the relevance as {@code Integer} in 0..100,
     *         [1] the union of both hit arrays as {@code long[]}, [2] the number of
     *         documents hit by both tags as {@code Integer}
     */
    private Object[] getWordsRelevance(TagHitEntry entry1, TagHitEntry entry2, long[] docHitsMerged, int sameDocCount,
            ClusteringOptions opt, int hitsItemCount) {
        docHitsMerged = new long[hitsItemCount];
        sameDocCount = 0;

        String tag1 = entry1.getKey();
        String tag2 = entry2.getKey();
        // Identity check on purpose: it only guards against scoring a key object against itself.
        assert (tag2 != tag1);

        long[] lDocs0 = entry1.getValue();
        long[] lDocs1 = entry2.getValue();
        int n0 = getDocHitCount(lDocs0);
        int n1 = getDocHitCount(lDocs1);
        int docCountMin = Math.min(n0, n1);
        int docCountMerged = 0;
        int diffDocCount = 0;
        for (int i = 0; i < lDocs0.length; i++) {
            docHitsMerged[i] = lDocs0[i] | lDocs1[i]; // documents hit by either tag
            docCountMerged += getValidBitCount(docHitsMerged[i]);
            diffDocCount += getValidBitCount(lDocs0[i] ^ lDocs1[i]); // hit by exactly one tag
            sameDocCount += getValidBitCount(lDocs0[i] & lDocs1[i]); // hit by both tags
        }

        // A tag that is a substring of the other gets a higher score.
        boolean isSubstring = tag2.contains(tag1) || tag1.contains(tag2);
        if (isSubstring) {
            docCountMin += opt.getTagMinDocCount();
        }

        // No shared document and no substring relation, or too few documents: unrelated.
        if (((sameDocCount == 0) && (!isSubstring)) || (docCountMin < opt.getTagMinDocCount())) {
            return new Object[] { Integer.valueOf(0), docHitsMerged, Integer.valueOf(sameDocCount) };
        }

        int samePercent = (int) Math.round(sameDocCount * 100.0D / docCountMerged);
        int samePercentMin = (int) Math.round(sameDocCount * 100.0D / docCountMin);
        int diffPercent = (int) Math.round(diffDocCount * 100.0D / docCountMerged);
        LOG.debug("関連性：" + tag1 + "(" + n0 + ")-(" + n1 + ")" + tag2);
        LOG.debug(", SamePercent=" + samePercent);
        LOG.debug(", SamePercentMin=" + samePercentMin);
        LOG.debug(", DiffPercent=" + diffPercent);
        int nRank;
        if ((sameDocCount >= opt.getMinSameDocs())
                && ((docCountMin < 10) || (samePercentMin >= opt.getMinSameDocPercent()))) {
            nRank = (int) Math.round((samePercentMin + samePercent) * 0.85D - diffPercent * 0.2D);
        } else {
            nRank = 0;
        }
        if (isSubstring) {
            nRank += 80;
        }
        LOG.debug(", Rank=" + nRank);

        // Clamp to 0..100. The formula can go negative when the difference term
        // dominates, and callers use the rank as an index into a 101-bucket list.
        int clamped = Math.max(0, Math.min(nRank, 100));
        return new Object[] { Integer.valueOf(clamped), docHitsMerged, Integer.valueOf(sameDocCount) };
    }
 
    /**
     * Wraps a matrix row in a {@link TagHitEntry}. The hit array is shared with the map
     * entry, not copied.
     *
     * @param e matrix row (tag and its hit words)
     * @return a new entry holding the same key and the same array
     */
    private TagHitEntry mapEntry2TagHitEntry(Map.Entry<String, long[]> e) {
        String tag = e.getKey();
        long[] hitWords = e.getValue();
        return new TagHitEntry(tag, hitWords);
    }
 
    /**
     * Turns the merged tag matrix into the cluster list of one level.
     *
     * <p>Each document is assigned to the matrix key that contains most of the document's
     * tags (the first three tags of a document weigh 2, later ones 1). Keys with at least
     * two documents become clusters, ordered by descending document count. Documents that
     * match no key, that are alone under their key, or whose cluster lies beyond
     * {@code opt.getMaxClusterCount()} end up in a trailing catch-all cluster whose title
     * and tags are "其他".
     *
     * @param matrix merged tag/document matrix
     * @param docs   the documents that were clustered
     * @param level  level number stored in every created cluster
     * @param opt    clustering options of this level (only the maximum cluster count is read)
     * @return the clusters of this level
     */
    private List<DocCluster> createResult(TagHitMatrix matrix, ICTHit[] docs, int level, ClusteringOptions opt) {
        int i, j;
        // One (initially empty) document-index bucket per matrix key.
        Map<String, DocValue> clsIdList = new HashMap<String, DocValue>();
        for (Map.Entry<String, long[]> de : matrix.entrySet()) {
            clsIdList.put(de.getKey(), new DocValue());
        }

        List<Integer> otherIdList = new ArrayList<Integer>();
        // Decide for every document which key it belongs to.
        for (i = 0; i < docs.length; i++) {
            TagHitMatrix.ClusterDocInfo di = matrix.docs[i];
            assert (docs[i] != null);
            int maxTagHit = 0;
            String bestKey = null;

            for (Map.Entry<String, long[]> hits : matrix.entrySet()) {
                // Surround with commas so that only whole tags of the key match.
                String clsWordListStr = "," + hits.getKey() + ",";
                int tagHitCount = 0;
                for (j = 0; j < di.TagCount; j++) {
                    String tag = di.TagList[j];
                    assert (tag.length() > 0);
                    if (clsWordListStr.contains("," + tag + ",")) {
                        tagHitCount += j < 3 ? 2 : 1;
                    }
                }
                // Strictly greater: on a tie the key seen first wins.
                if (tagHitCount > maxTagHit) {
                    maxTagHit = tagHitCount;
                    bestKey = hits.getKey();
                }
            }

            if (maxTagHit > 0) {
                clsIdList.get(bestKey).idList.add(Integer.valueOf(i));
            } else {
                otherIdList.add(Integer.valueOf(i));
            }
        }

        // Build the cluster list, kept sorted by descending document count.
        List<DocCluster> clusterList = new ArrayList<DocCluster>();
        for (Map.Entry<String, DocValue> kv : clsIdList.entrySet()) {
            List<Integer> ids = kv.getValue().idList;
            if (ids.size() <= 0) {
                continue;
            }
            if (ids.size() == 1) {
                otherIdList.add(ids.get(0));
                continue;
            }
            String[] docIds = new String[ids.size()];
            ICTHit[] members = new ICTHit[ids.size()];
            for (i = 0; i < ids.size(); i++) {
                ICTHit doc = docs[ids.get(i).intValue()];
                docIds[i] = doc.getDocId();
                members[i] = doc;
            }
            DocCluster cluster = new DocCluster();
            cluster.setDocIdList(docIds);
            cluster.setDocList(members);
            cluster.setLevel(level);
            cluster.setTags(kv.getKey());

            // Insert behind all clusters that are at least as large.
            int pos = 0;
            while ((pos < clusterList.size())
                    && (docIds.length <= clusterList.get(pos).getDocIdList().length)) {
                pos++;
            }
            clusterList.add(pos, cluster);
        }

        // Clusters beyond the configured maximum are dissolved into the catch-all list.
        int maxClusters = opt.getMaxClusterCount();
        while (maxClusters < clusterList.size()) {
            DocCluster surplus = clusterList.remove(maxClusters);
            otherIdList.addAll(clsIdList.get(surplus.getTags()).idList);
        }

        for (i = 0; i < clusterList.size(); i++) {
            DocCluster dc1 = clusterList.get(i);
            String[] tagList = dc1.getTags().split(",");
            String newTags = "";

            // Keep only tags whose text occurs exactly once in the whole tag string.
            for (j = 0; j < tagList.length; j++) {
                int firstPos = dc1.getTags().indexOf(tagList[j]);
                int lastPos = dc1.getTags().lastIndexOf(tagList[j]);
                if (firstPos == lastPos) {
                    newTags = newTags + tagList[j] + ",";
                }
            }
            if ((newTags.trim().length() > 0) && (newTags.endsWith(","))) {
                newTags = newTags.substring(0, newTags.length() - 1);
            }
            dc1.setTags(newTags);

            dc1.setTitle("");

            if (this.useTagsAsTitle) {
                // Build the title from the tags, at most 16 characters, skipping tags
                // that overlap with a title already present in the cluster list.
                tagList = dc1.getTags().split(",");
                for (j = 0; (tagList != null) && (j < tagList.length); j++) {
                    if ((dc1.getTitle() + tagList[j]).length() > 16) {
                        break;
                    }
                    boolean isSubstr = false;
                    for (DocCluster c : clusterList) {
                        if ((c.getTitle().length() <= 0)
                                || ((!c.getTitle().contains(tagList[j])) && (!tagList[j].contains(c.getTitle())))) {
                            continue;
                        }
                        isSubstr = true;
                        break;
                    }
                    if (!isSubstr) {
                        dc1.setTitle(dc1.getTitle() + tagList[j] + ",");
                    }
                }
                if ((dc1.getTitle().trim().length() > 0) && (dc1.getTitle().endsWith(","))) {
                    dc1.setTitle(dc1.getTitle().substring(0, dc1.getTitle().length() - 1));
                }
            }

            // Compare by content; the former '!= ""' compared object identity.
            if (dc1.getTitle().length() > 0) {
                continue;
            }
            // No title yet: fall back to the tag string, cut at the last comma within
            // the first 16 characters when it is longer than that.
            dc1.setTitle(dc1.getTags());
            if (dc1.getTitle().length() <= 16) {
                continue;
            }
            String head = dc1.getTitle().substring(0, 16);
            int lastComma = head.lastIndexOf(',');
            if (lastComma > 0) {
                dc1.setTitle(head.substring(0, lastComma));
            }
        }

        if (otherIdList.size() > 0) {
            String[] otherDocIds = new String[otherIdList.size()];
            ICTHit[] otherDocs = new ICTHit[otherIdList.size()];
            for (int k = 0; k < otherIdList.size(); k++) {
                int idx = otherIdList.get(k).intValue();
                otherDocIds[k] = docs[idx].getDocId();
                otherDocs[k] = docs[idx];
            }
            DocCluster clusterOther = new DocCluster();
            clusterOther.setDocIdList(otherDocIds);
            clusterOther.setDocList(otherDocs);
            clusterOther.setLevel(level);
            clusterOther.setTitle("其他");
            clusterOther.setTags("其他");
            clusterList.add(clusterOther);
        }

        return clusterList;
    }
 
    /** @return the clusters produced by the last {@code cluster()} call, or whatever was set explicitly */
    public List<DocCluster> getClusters() {
        return clusters;
    }

    /** @param clusters cluster list to store */
    public void setClusters(List<DocCluster> clusters) {
        this.clusters = clusters;
    }

    /** @return the documents to cluster */
    public ICTHit[] getDocs() {
        return docs;
    }

    /** @param docs documents to cluster */
    public void setDocs(ICTHit[] docs) {
        this.docs = docs;
    }

    /** @return the configured maximum number of cluster levels */
    public int getMaxLevels() {
        return maxLevels;
    }

    /** @param maxLevels maximum number of cluster levels */
    public void setMaxLevels(int maxLevels) {
        this.maxLevels = maxLevels;
    }

    /** @return the per-level option array (the same array instance, not a copy) */
    public ClusteringOptions[] getOptions() {
        return options;
    }

    /** @param options per-level option array; {@code cluster()} reads index 0 and, for sub-clusters, index 1 */
    public void setOptions(ClusteringOptions[] options) {
        this.options = options;
    }

    /** @return whether cluster titles are built from the tags */
    public boolean isUseTagsAsTitle() {
        return useTagsAsTitle;
    }

    /** @param useTagsAsTitle whether cluster titles are built from the tags */
    public void setUseTagsAsTitle(boolean useTagsAsTitle) {
        this.useTagsAsTitle = useTagsAsTitle;
    }

    /** @return the exclusion string (comma separated tags that must not be used) */
    public String getWordsExcluded() {
        return wordsExcluded;
    }

    /** @param wordsExcluded comma separated tags that must not be used */
    public void setWordsExcluded(String wordsExcluded) {
        this.wordsExcluded = wordsExcluded;
    }
 
    /** Per-key bucket used while building the result: the indexes of the documents assigned to the key. */
    private class DocValue {
        /** Not read anywhere in this class. */
        public String titleListStr = "";
        /** Indexes (into the document array) of the documents assigned to this key. */
        public List<Integer> idList = new ArrayList<Integer>();

        private DocValue() {
        }
    }
    /**
     * Unordered pair of tag ids, stored with the smaller id first. A non-negative id is
     * the position of a tag in the tag/document matrix; negative ids denote merged entries.
     *
     * <p>Instances are used as {@code HashSet} elements, so {@code equals} and
     * {@code hashCode} follow the usual contract: {@code hashCode} is derived from both
     * ids (it used to be the constant -1, which put every pair into one hash bucket) and
     * {@code equals} returns {@code false} for {@code null} and foreign objects instead
     * of throwing.
     *
     * @version created 2011-3-9 14:52:44
     */
    private class IdPair {
        public final int Id1;
        public final int Id2;

        public IdPair(int id1, int id2) {
            assert (id1 != id2);
            this.Id1 = Math.min(id1, id2);
            this.Id2 = Math.max(id1, id2);
        }

        @Override
        public int hashCode() {
            return 31 * this.Id1 + this.Id2;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (!(o instanceof IdPair)) {
                return false;
            }
            IdPair other = (IdPair) o;
            return (other.Id1 == this.Id1) && (other.Id2 == this.Id2);
        }
    }
 
    /** Mutable tag / hit-words pair; both fields are public and also reachable through getters. */
    public static class TagHitEntry {
        /** Tag text, or several tags joined by commas for a merged entry. */
        public String key;
        /** Hit words of the tag (one bit per document). */
        public long[] value;

        /** Creates an entry with a {@code null} key and {@code null} hits. */
        public TagHitEntry() {
            this(null, null);
        }

        /**
         * @param k tag text
         * @param v hit words; the array is stored as is, not copied
         */
        public TagHitEntry(String k, long[] v) {
            key = k;
            value = v;
        }

        public String getKey() {
            return key;
        }

        public long[] getValue() {
            return value;
        }
    }
}

ClusteringOptions.java

/**
 * Tunable thresholds for one clustering level. A new instance starts with the current
 * values of the public static {@code Def*} fields.
 *
 * @version created 2011-3-8 10:23:27
 */
public class ClusteringOptions {
    public static int DefMaxClusterCount = 20;
    public static int DefMaxKeywordCount = 6;
    public static int DefMinWordsRelevance = 10;
    public static int DefTagMinDocCount = 3;
    public static int DefIgnoreSameDocs = 2;
    public static int DefSameDocPercent = 50;
    public static int DefMinDocsToCluster = 8;

    // Each field is initialized from its static default at construction time.
    private int docMaxTagCount = DefMaxKeywordCount;
    private int maxClusterCount = DefMaxClusterCount;
    private int minDocsToCluster = DefMinDocsToCluster;
    private int minSameDocPercent = DefSameDocPercent;
    private int minSameDocs = DefIgnoreSameDocs;
    private int minTagRelevance = DefMinWordsRelevance;
    private int tagMinDocCount = DefTagMinDocCount;

    /** Creates an option set holding the defaults. */
    public ClusteringOptions() {
    }

    /** @return maximum number of tags taken from one document */
    public int getDocMaxTagCount() {
        return docMaxTagCount;
    }

    public void setDocMaxTagCount(int docMaxTagCount) {
        this.docMaxTagCount = docMaxTagCount;
    }

    /** @return maximum number of clusters per level */
    public int getMaxClusterCount() {
        return maxClusterCount;
    }

    public void setMaxClusterCount(int maxClusterCount) {
        this.maxClusterCount = maxClusterCount;
    }

    /** @return minimum number of documents a cluster needs before it is clustered further */
    public int getMinDocsToCluster() {
        return minDocsToCluster;
    }

    public void setMinDocsToCluster(int minDocsToCluster) {
        this.minDocsToCluster = minDocsToCluster;
    }

    /** @return minimum percentage of shared documents */
    public int getMinSameDocPercent() {
        return minSameDocPercent;
    }

    public void setMinSameDocPercent(int minSameDocPercent) {
        this.minSameDocPercent = minSameDocPercent;
    }

    /** @return minimum number of shared documents */
    public int getMinSameDocs() {
        return minSameDocs;
    }

    public void setMinSameDocs(int minSameDocs) {
        this.minSameDocs = minSameDocs;
    }

    /** @return minimum relevance two tags need to be merged */
    public int getMinTagRelevance() {
        return minTagRelevance;
    }

    public void setMinTagRelevance(int minTagRelevance) {
        this.minTagRelevance = minTagRelevance;
    }

    /** @return minimum number of documents a tag must hit to be kept */
    public int getTagMinDocCount() {
        return tagMinDocCount;
    }

    public void setTagMinDocCount(int tagMinDocCount) {
        this.tagMinDocCount = tagMinDocCount;
    }
}

DocCluster.java

/**
 * One cluster of documents: its member documents and their ids, its level, its tags and
 * title, and optionally a list of sub-clusters.
 *
 * @version created 2011-3-8 10:23:35
 */
public class DocCluster {
    private String[] docIdList;
    private ICTHit[] docList;
    private int level;
    private List<DocCluster> subclusters;
    private String tags;
    private String title;

    /** @return ids of the member documents (the stored array, not a copy) */
    public String[] getDocIdList() {
        return docIdList;
    }

    public void setDocIdList(String[] docIdList) {
        this.docIdList = docIdList;
    }

    /** @return the member documents (the stored array, not a copy) */
    public ICTHit[] getDocList() {
        return docList;
    }

    public void setDocList(ICTHit[] docList) {
        this.docList = docList;
    }

    /** @return the level number of this cluster */
    public int getLevel() {
        return this.level;
    }

    public void setLevel(int level) {
        this.level = level;
    }

    /** @return the sub-clusters, or {@code null} when none were set */
    public List<DocCluster> getSubclusters() {
        return subclusters;
    }

    public void setSubclusters(List<DocCluster> subclusters) {
        this.subclusters = subclusters;
    }

    /** @return the tag string of this cluster */
    public String getTags() {
        return tags;
    }

    public void setTags(String tags) {
        this.tags = tags;
    }

    /**
     * Returns the title; a {@code null} title is replaced by the empty string first, and
     * that replacement is stored.
     *
     * @return the title, never {@code null}
     */
    public String getTitle() {
        if (this.title == null) {
            this.title = "";
        }
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }
}

ICTHit.java

/** A document to cluster: its id, its title and its keyword (tag) list. */
public class ICTHit implements Serializable {
    // Field names are left untouched because they are part of the default serialized form.

    /** Keyword array. */
    private String[] TagList;
    private String docId;
    private String title;

    /** @return the keyword array (the stored array, not a copy), possibly {@code null} */
    public String[] getTagList() {
        return this.TagList;
    }

    public void setTagList(String[] tagList) {
        this.TagList = tagList;
    }

    /** @return the document id */
    public String getDocId() {
        return this.docId;
    }

    public void setDocId(String docId) {
        this.docId = docId;
    }

    /** @return the document title */
    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

}

TagHitMatrix.java

public class TagHitMatrix extends LinkedHashMap<String, long[]> {
    /**
     * 
     */
    private static final long serialVersionUID = -7511464445378974433L;
    public static int ii = 0;
    public ClusterDocInfo[] docs;
    public int hitsItemCount;
 
    public TagHitMatrix(int DocCount, int MaxTagCount) {
        this.hitsItemCount = (int) (DocCount / 62.0D + 0.984375D);
        this.docs = new ClusterDocInfo[DocCount];
 
        for (int i = 0; i < this.docs.length; i++)
            this.docs[i] = new ClusterDocInfo(MaxTagCount);
    }
 
    public void AddDocHit(String TagStr, int Position) {
        TagStr = TagStr.trim();
 
        int n = Position / 62;
        int m = Position % 62;
        long[] DocHits = (long[]) get(TagStr);
        if (DocHits == null) {
            DocHits = new long[this.hitsItemCount];
            put(TagStr, DocHits);
        }
        DocHits[n] |= Math.round(Math.pow(2.0D, m));
        ClusterDocInfo di = this.docs[Position];
        di.TagList[(di.TagCount++)] = TagStr;
    }
 
    class ClusterDocInfo {
        public String[] TagList;
        public int TagCount;
 
        public ClusterDocInfo(int MaxTagCount) {
            this.TagList = new String[MaxTagCount];
            this.TagCount = 0;
        }
    }
}

テストメソッド：

/**
 * Sample driver: clusters the given documents, prints the clusters to standard output
 * and appends them to a text file.
 *
 * @param icthits documents to cluster
 * @throws IOException when the output file cannot be opened or written
 */
public void test(ICTHit[] icthits) throws IOException {
        ClusterBuilder clusterBuilder = new ClusterBuilder();
        // Documents to cluster.
        clusterBuilder.setDocs(icthits);
        // Maximum number of cluster levels (any value above 1 enables sub-clusters).
        clusterBuilder.setMaxLevels(10);
        clusterBuilder.setUseTagsAsTitle(true);
        // Words that must not be used as cluster tags.
        clusterBuilder.setWordsExcluded("万美元,日本,公司,视频,北京时间,图文,新华网,新浪,消息,通讯,互联网,美国,中国");
        clusterBuilder
                .setOptions(new ClusteringOptions[] { new ClusteringOptions(),new ClusteringOptions() });

        // Run the clustering.
        clusterBuilder.cluster();
        BufferedWriter bw1 = new BufferedWriter(new FileWriter("c:/today-20110509-cluster.txt ", true));
        // try/finally so the file is closed even when writing fails part-way.
        try {
            // Output the result.
            if (clusterBuilder.getClusters() != null) {
                for (DocCluster docCluster : clusterBuilder.getClusters()) {
                    System.out.println("tag:" + docCluster.getTags() + "("
                            + docCluster.getDocIdList().length + ")");
                    bw1.write(docCluster.getTags() + "("+ docCluster.getDocIdList().length + ")"+"\r\n ");

                    if (docCluster.getDocList() != null
                            && docCluster.getDocList().length > 0) {
                        for (ICTHit co : docCluster.getDocList()) {
                            System.out.println("     DocID: " + co.getDocId());
                            bw1.write("タイトル: "    + co.getTitle()+",ID: "+co.getDocId()+"\r\n ");
                            String[] tags = co.getTagList();
                            // A document may have no tag list at all.
                            for (int m = 0; tags != null && m < tags.length; m++) {
                                // Write the keyword itself; the title line used to be
                                // written again here once per keyword.
                                bw1.write("     キーワード: " + tags[m] + "\r\n ");
                                System.out.println("     キーワード: "
                                        + tags[m]);
                            }
                            System.out.println("");
                        }
                        System.out.println("");
                    } else {
                        bw1.write("      このカテゴリにはデータがありません！"+"\r\n ");
                    }
                    bw1.write("-------------------------------------------------------------------------------\r\n");
                }
            }
        } finally {
            // Closing the BufferedWriter also closes the wrapped FileWriter.
            bw1.close();
        }
    }

上記のコードはサンプルであり、本番環境ではまだ使用されていません。ただし、中心となるメソッドはすべて掲載していますので、ご自身のプロジェクトに取り込んでご利用ください。carrot2の標準的な手法よりも良い結果が得られます。

タグ: Java、クラスタリング、テキスト処理、アルゴリズム、自然言語処理

6月27日 22:07 投稿

異端開発室

Javaによるテキストクラスタリングの実装

ホットタグ