增加图片摘要id生成，可进行区域内图片快速相似度对比。

2024-11-29 18:27:37 +08:00 · 2022-11-21 16:21:38 +08:00 · 2022-11-21 16:21:38 +08:00 · 5d88a5157b
commit 5d88a5157b
parent 6231926a71
6 changed files with 625 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -123,6 +123,20 @@
        本演示训练素材位置在： src/test/image
        注意：以上图片识别代码样例为训练素材为物品全图充满图片(自己看能看到橘子训练图片为全图充满，苹果也是).自行开发时用以上代码样例时，请也使用全图充满训练物品的图片来做训练，非全图充满训练素材图训练api有变化！
 ```
+### 通过给图片生成摘要id进行快速相似度对比
+``` java
+//参数分别为：
+//第一个参数:threeChannelMatrix,图片矩阵（图片矩阵如何提取上文已讲，不再赘述）
+//第二个参数:boxSize,将一张图片横纵各分为几个区域提取特征
+//参数说明：该值越大，摘要id敏感度越高。该参数有最大值：图片最小边长/5，超过会报数组越界错误
+//第三个参数:regionSize,相似特征区域分区种类数量
+//参数说明：该值越大，摘要id敏感度越高
+//返回name 即为该图片摘要id，通过id逐位对比即可对比相似程度
+//什么是id敏感度：
+//id敏感度越高，对图片变化越敏感，越适合越大的检索区域匹配，即特征越细致，但缺点id长度越长。
+//id敏感度越低，对图片变化越不敏感，越适合越小的检索区域匹配,特征越粗，优点是id长度越短。
+String name = creatImageName(threeChannelMatrix, 5, 10);
+```
 ### 自然语言分类最简API 说明:
 ``` java
         //通过txt默认格式进行读取
--- a/src/main/java/org/wlld/regressionForest/LinearRegression.java
+++ b/src/main/java/org/wlld/regressionForest/LinearRegression.java
@ -0,0 +1,105 @@
+package org.wlld.regressionForest;
+
+
+import org.wlld.MatrixTools.Matrix;
+import org.wlld.MatrixTools.MatrixOperation;
+
+/**
+ * Linear regression over two inputs: fits {@code value = w1 * x + w2 * y + b} to the samples
+ * added through {@link #insertXY(double[], double)}.
+ * <p>
+ * When the solver reports that no fit is possible, {@link #getValue(double, double)} falls back
+ * to the mean of the inserted target values.
+ *
+ * @Author LiDaPeng
+ */
+public class LinearRegression {
+    private double w1;//coefficient applied to x
+    private double w2;//coefficient applied to y
+    private double b;//constant term
+    private Matrix XY;//sample matrix, one row per sample: [x, y, 1]
+    private Matrix NormSequence;//target value of each sample (single column)
+    private int xIndex = 0;//number of samples inserted so far
+    private boolean isRegression = false;//true once a fit has been computed successfully
+    private double avg;//mean of the inserted targets, used when no fit is available
+
+    /**
+     * Creates a regression able to hold {@code size} samples.
+     *
+     * @param size number of rows allocated for the sample and target matrices
+     */
+    public LinearRegression(int size) {
+        XY = new Matrix(size, 3);
+        NormSequence = new Matrix(size, 1);
+        xIndex = 0;
+        avg = 0;
+    }
+
+    /**
+     * Creates an instance without sample storage. Only the weight accessors and
+     * {@link #getCos(Matrix)} are usable on it; inserting samples requires the sized constructor.
+     */
+    public LinearRegression() {
+
+    }
+
+    /**
+     * Appends one sample.
+     *
+     * @param xy       the two input coordinates {@code [x, y]}
+     * @param sequence the target value observed at that coordinate
+     * @throws Exception if {@code xy} does not hold exactly two values, or if the matrix write fails
+     */
+    public void insertXY(double[] xy, double sequence) throws Exception {
+        if (xy.length == 2) {
+            XY.setNub(xIndex, 0, xy[0]);
+            XY.setNub(xIndex, 1, xy[1]);
+            XY.setNub(xIndex, 2, 1.0);//constant column so that b is fitted as the intercept
+            NormSequence.setNub(xIndex, 0, sequence);
+            xIndex++;
+        } else {
+            // The message used to talk about an rgb triple although the check is for two values.
+            throw new Exception("xy length is not equals two");
+        }
+    }
+
+    /**
+     * Evaluates the model at a point.
+     *
+     * @return {@code w1 * x + w2 * y + b} after a successful fit, otherwise the stored mean
+     */
+    public double getValue(double x, double y) {
+        if (isRegression) {
+            return w1 * x + w2 * y + b;
+        }
+        return avg;
+    }
+
+    /**
+     * Returns the cosine between the weight vector {@code [w1, w2, b]} and {@code vector},
+     * as computed by {@code MatrixOperation.getNormCos}.
+     *
+     * @param vector the vector to compare against
+     * @throws Exception propagated from the matrix helpers
+     */
+    public double getCos(Matrix vector) throws Exception {
+        Matrix matrix = new Matrix(1, 3);
+        matrix.setNub(0, 0, w1);
+        matrix.setNub(0, 1, w2);
+        matrix.setNub(0, 2, b);
+        return MatrixOperation.getNormCos(matrix, vector);
+    }
+
+    /**
+     * Runs the regression over the inserted samples.
+     * <p>
+     * A 1x1 result from {@code MatrixOperation.getLinearRegression} is treated as the
+     * "singular matrix" signal; in that case the mean of the targets is stored instead.
+     *
+     * @throws Exception if no sample has been inserted, or if a matrix helper fails
+     */
+    public void regression() throws Exception {
+        if (xIndex > 0) {
+            Matrix ws = MatrixOperation.getLinearRegression(XY, NormSequence);
+            if (ws.getX() == 1 && ws.getY() == 1) {//singular matrix
+                isRegression = false;
+                // Sum every inserted target (row i, not row xIndex) into a fresh accumulator
+                // so that calling regression() again does not compound the previous mean.
+                double sum = 0;
+                for (int i = 0; i < xIndex; i++) {
+                    sum = sum + NormSequence.getNumber(i, 0);
+                }
+                avg = sum / xIndex;
+            } else {
+                w1 = ws.getNumber(0, 0);
+                w2 = ws.getNumber(1, 0);
+                b = ws.getNumber(2, 0);
+                isRegression = true;
+            }
+        } else {
+            throw new Exception("regression matrix size is zero");
+        }
+    }
+
+    public double getW1() {
+        return w1;
+    }
+
+    public void setW1(double w1) {
+        this.w1 = w1;
+    }
+
+    public double getW2() {
+        return w2;
+    }
+
+    public void setW2(double w2) {
+        this.w2 = w2;
+    }
+
+    public double getB() {
+        return b;
+    }
+
+    public void setB(double b) {
+        this.b = b;
+    }
+}
--- a/src/main/java/org/wlld/tools/FastPictureExcerpt.java
+++ b/src/main/java/org/wlld/tools/FastPictureExcerpt.java
@ -0,0 +1,72 @@
+package org.wlld.tools;
+
+import org.wlld.MatrixTools.Matrix;
+import org.wlld.entity.ThreeChannelMatrix;
+import org.wlld.regressionForest.LinearRegression;
+
+public class FastPictureExcerpt {//image digest-id generation
+
+    /**
+     * Builds a digest id for an image: the channel returned by {@code getH()} is cut into a grid
+     * of boxes and one region index is appended per box.
+     * <p>
+     * Example: {@code String name = creatImageName(threeChannelMatrix, 5, 10);}
+     *
+     * @param threeChannelMatrix image matrix; only its {@code getH()} channel is read
+     * @param boxSize            number of boxes along each axis
+     * @param regionSize         number of cosine regions a box can be classified into
+     * @return the per-box region indexes concatenated, first axis outermost
+     * @throws Exception propagated from the matrix and regression helpers
+     */
+    public String creatImageName(ThreeChannelMatrix threeChannelMatrix, int boxSize, int regionSize) throws Exception {
+        final int cellSide = 5;//side of the cells sampled inside every box
+        // Reference direction [1, 0, 0] against which each box's fitted weights are compared.
+        Matrix reference = new Matrix(1, 3);
+        reference.setNub(0, 0, 1);
+        reference.setNub(0, 1, 0);
+        reference.setNub(0, 2, 0);
+        Matrix channel = threeChannelMatrix.getH();
+        int dimX = channel.getX();
+        int dimY = channel.getY();
+        // Half of the remainder is dropped on each side so the grid sits centred in the image.
+        int marginX = (dimX % boxSize) / 2;
+        int marginY = (dimY % boxSize) / 2;
+        int boxX = dimX / boxSize;
+        int boxY = dimY / boxSize;
+        Matrix cropped = channel.getSonOfMatrix(marginX, marginY, boxX * boxSize, boxY * boxSize);
+        int lastStartX = cropped.getX() - boxX;
+        int lastStartY = cropped.getY() - boxY;
+        StringBuilder digest = new StringBuilder();
+        for (int startX = 0; startX <= lastStartX; startX += boxX) {
+            for (int startY = 0; startY <= lastStartY; startY += boxY) {
+                Matrix box = cropped.getSonOfMatrix(startX, startY, boxX, boxY);
+                digest.append(getName(box, cellSide, reference, regionSize));
+            }
+        }
+        return digest.toString();
+    }
+
+    /**
+     * Classifies one box: fits a linear regression over values sampled on a cell grid, takes the
+     * cosine between the fitted weights and {@code vector}, and returns the index of the nearest
+     * of {@code regionSize} reference cosines {@code cos(PI * i / regionSize)}.
+     *
+     * @param h          the box to classify
+     * @param iSize      side length of a sampling cell
+     * @param vector     reference vector for the cosine
+     * @param regionSize number of candidate regions
+     * @return the winning region index as a decimal string
+     * @throws Exception propagated from the matrix and regression helpers
+     */
+    private String getName(Matrix h, int iSize, Matrix vector, int regionSize) throws Exception {
+        int dimX = h.getX();
+        int dimY = h.getY();
+        LinearRegression fit = new LinearRegression((dimX / iSize) * (dimY / iSize));
+        int probe = iSize / 2 + 1;//position read inside every cell
+        double cellsAlongX = (double) dimX / iSize;
+        double cellsAlongY = (double) dimY / iSize;
+        double[] point = new double[2];
+        for (int i = 0; i + iSize <= dimX; i += iSize) {
+            for (int j = 0; j + iSize <= dimY; j += iSize) {
+                Matrix cell = h.getSonOfMatrix(i, j, iSize, iSize);
+                double sample = cell.getNumber(probe, probe);//gray value, per the original comment
+                // Cell position scaled by the number of cells along that axis.
+                point[0] = i / (double) iSize / cellsAlongX;
+                point[1] = j / (double) iSize / cellsAlongY;
+                fit.insertXY(point, sample);
+            }
+        }
+        fit.regression();
+        double measured = fit.getCos(vector);
+        double step = 1 / (double) regionSize;
+        int nearest = 0;
+        double smallestGap = -1;
+        for (int i = 0; i < regionSize; i++) {
+            double gap = Math.abs(Math.cos(Math.PI * step * i) - measured);
+            // The first candidate always wins initially; later ones must be strictly closer.
+            if (i == 0 || gap < smallestGap) {
+                smallestGap = gap;
+                nearest = i;
+            }
+        }
+        return String.valueOf(nearest);
+    }
+}
--- a/src/main/java/org/wlld/voice/MP3.java
+++ b/src/main/java/org/wlld/voice/MP3.java
@ -0,0 +1,299 @@
+package org.wlld.voice;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Minimal MP3 frame reader: parses the first frame header (skipping a leading ID3v2 tag) and
+ * then returns the stream frame by frame.
+ */
+public class MP3 {
+
+    /**
+     * Samples per frame, indexed {@code [Layer_Version - 1][Mpeg_Version - 1]}, where MPEG
+     * version 1/2/3 stands for MPEG-1 / MPEG-2 / MPEG-2.5.
+     */
+    private final static int[][] Mp3_Sample = {{384, 384, 384},
+            {1152, 1152, 1152}, {1152, 576, 576}};
+
+    /**
+     * MPEG-1 bit rates, indexed {@code [bitrateIndex - 1][layer - 1]} for bitrate indexes 1..14.
+     */
+    private final static int[][] MPeg1ByteRate = {{32, 32, 32},
+            {64, 48, 40}, {96, 56, 48}, {128, 64, 56}, {160, 80, 64},
+            {192, 96, 80}, {224, 112, 96}, {256, 128, 112},
+            {288, 160, 128}, {320, 192, 160}, {352, 224, 192},
+            {384, 256, 224}, {416, 320, 256}, {448, 384, 320}};
+
+    /**
+     * Bit rates for the other MPEG versions, indexed {@code [bitrateIndex - 1][column]} where
+     * column 0 is Layer 1 and column 1 is shared by Layer 2 and Layer 3.
+     */
+    private final static int[][] MPeg2ByteRate = {{32, 8}, {48, 16},
+            {56, 24}, {64, 32}, {80, 40}, {96, 48}, {112, 56},
+            {128, 64}, {144, 80}, {160, 96}, {176, 112}, {192, 128},
+            {224, 144}, {256, 160}};
+
+    /**
+     * Sampling frequencies, indexed {@code [Mpeg_Version - 1][frequencyBits]}.
+     */
+    private final static int[][] SampleFrequency = {{44100, 48000, 32000},
+            {22050, 24000, 16000}, {11025, 12000, 8000}};
+
+    // ==================================================================================
+    private InputStream stream;
+
+    /**
+     * @param stream source of the MP3 bytes; it is read sequentially and never closed here
+     */
+    public MP3(InputStream stream) {
+        this.stream = stream;
+    }
+
+    /**
+     * MPEG version: 1 = MPEG-1, 2 = MPEG-2, 3 = MPEG-2.5.
+     */
+    private int Mpeg_Version;
+
+    /**
+     * Layer number (1, 2 or 3).
+     */
+    private int Layer_Version;
+
+    /**
+     * Bit rate of the current frame, as listed in the tables above.
+     */
+    private int ByteRate;
+
+    /**
+     * Sampling frequency taken from {@link #SampleFrequency}.
+     */
+    private int Frequency;
+
+    /**
+     * Frame length adjustment, 0 or 1.
+     */
+    private int Padding;
+
+    /**
+     * Samples per frame taken from {@link #Mp3_Sample}.
+     */
+    private int Sample;
+
+    /**
+     * First three bytes of the first frame, cached by {@link #parserMp3Header()} so that the
+     * next {@link #ParserFrame()} call can emit that frame without re-reading them.
+     * {@code null} once consumed.
+     */
+    private int[] FrameHeader;
+
+    /**
+     * Reads the start of the stream, skips an ID3v2 tag if one is present, and decodes the first
+     * frame header into the version, layer, bit rate, frequency, padding and sample fields.
+     *
+     * @return the total size of the ID3v2 tag in bytes (0 when there is none), or -1 when the
+     * stream ends early or the header is not a valid frame header
+     * @throws IOException if reading the stream fails
+     */
+    public int parserMp3Header() throws IOException {
+        FrameHeader = new int[3];
+        if (!loadFrameHeader()) {
+            return -1;
+        }
+        int TagHeaderSize = 0;
+        if (FrameHeader[0] == 'I' && FrameHeader[1] == 'D'
+                && FrameHeader[2] == '3') {
+            byte[] tagHeader = readFull(7);
+            // A truncated tag header must be rejected before its size bytes are indexed.
+            if (tagHeader == null || tagHeader.length < 7) {
+                return -1;
+            }
+            // Four 7-bit groups; the value excludes the 10 header bytes already consumed.
+            int tagHeaderSize = ((tagHeader[3] & 0x7F) << 21)
+                    + ((tagHeader[4] & 0x7F) << 14)
+                    + ((tagHeader[5] & 0x7F) << 7) + (tagHeader[6] & 0x7F);
+            if (!skipBytes(tagHeaderSize)) {
+                return -1;
+            }
+            // Full length of the ID3 tag including its 10-byte header.
+            TagHeaderSize = tagHeaderSize + 10;
+            if (!loadFrameHeader()) {
+                return -1;
+            }
+        }
+        // Frame sync: first byte all ones and the top three bits of the second byte set.
+        if (FrameHeader[0] != 0xFF || (FrameHeader[1] >> 5) != 7) {
+            return -1;
+        }
+        switch ((FrameHeader[1] & 24) >> 3) {
+            case 0:
+                Mpeg_Version = 3;// MPEG-2.5
+                break;
+            case 2:
+                Mpeg_Version = 2;
+                break;
+            case 3:
+                Mpeg_Version = 1;
+                break;
+            default:
+                return -1;
+        }
+        switch ((FrameHeader[1] & 6) >> 1) {
+            case 1:
+                Layer_Version = 3;
+                break;
+            case 2:
+                Layer_Version = 2;
+                break;
+            case 3:
+                Layer_Version = 1;
+                break;
+            default:
+                return -1;
+        }
+        int index = FrameHeader[2] >> 4;
+        if (index < 1 || index > 14) {
+            return -1;
+        }
+        selectByteRate(index - 1);
+        int frequencyBits = (FrameHeader[2] & 12) >> 2;
+        // Value 3 has no entry in the frequency table; reject it instead of indexing past it.
+        if (frequencyBits >= SampleFrequency[Mpeg_Version - 1].length) {
+            return -1;
+        }
+        Frequency = SampleFrequency[Mpeg_Version - 1][frequencyBits];
+        Padding = (FrameHeader[2] & 2) >> 1;
+        Sample = Mp3_Sample[Layer_Version - 1][Mpeg_Version - 1];
+        return TagHeaderSize;
+    }
+
+    /**
+     * Reads three bytes into {@link #FrameHeader}.
+     *
+     * @return false when fewer than three bytes were available
+     */
+    private boolean loadFrameHeader() throws IOException {
+        byte[] tempBytes = readFull(3);
+        if (tempBytes == null || tempBytes.length < 3) {
+            return false;
+        }
+        for (int i = 0; i < 3; i++) {
+            FrameHeader[i] = tempBytes[i] & 0xFF;
+        }
+        return true;
+    }
+
+    /**
+     * Sets {@link #ByteRate} from the tables for the current version and layer; leaves it
+     * untouched when the layer has not been decoded yet.
+     *
+     * @param index zero-based row of the bit-rate tables
+     */
+    private void selectByteRate(int index) {
+        if (Mpeg_Version == 1) {
+            if (Layer_Version >= 1 && Layer_Version <= 3) {
+                ByteRate = MPeg1ByteRate[index][Layer_Version - 1];
+            }
+        } else if (Layer_Version == 1) {
+            ByteRate = MPeg2ByteRate[index][0];
+        } else if (Layer_Version == 2 || Layer_Version == 3) {
+            ByteRate = MPeg2ByteRate[index][1];
+        }
+    }
+
+    /**
+     * Reads and discards up to {@code num} frames.
+     *
+     * @param num number of frames to skip
+     * @return the number of bytes consumed by the frames that were skipped
+     * @throws IOException if reading the stream fails
+     */
+    public long SkipFrame(long num) throws IOException {
+        long skipped = 0;
+        // long counter: an int counter could never reach a bound above Integer.MAX_VALUE.
+        for (long i = 0; i < num; i++) {
+            byte[] temp = ParserFrame();
+            if (temp == null) {
+                return skipped;
+            }
+            skipped += temp.length;
+        }
+        return skipped;
+    }
+
+    /**
+     * Reads the next frame. {@link #parserMp3Header()} must have succeeded beforehand because
+     * the frame length depends on the fields it decodes.
+     * <p>
+     * The end of the stream is not tracked here beyond the "TAG" check, so callers should stop
+     * before the trailing 128-byte ID3v1 block.
+     *
+     * @return the complete frame including its first three header bytes, or {@code null} when
+     * an ID3v1 "TAG" marker is met, the stream ends, or the header is invalid
+     * @throws IOException           if reading the stream fails
+     * @throws IllegalStateException if no header has been parsed yet
+     */
+    public byte[] ParserFrame() throws IOException {
+        byte[] byteFrameHeader;
+        int index;
+        if (FrameHeader == null) {
+            byteFrameHeader = readFull(3);
+            if (byteFrameHeader == null || byteFrameHeader.length < 3
+                    || (byteFrameHeader[0] & 0xFF) == 'T'
+                    && (byteFrameHeader[1] & 0xFF) == 'A'
+                    && (byteFrameHeader[2] & 0xFF) == 'G') {
+                return null;
+            }
+            if ((byteFrameHeader[0] & 0xFF) != 0xFF
+                    || ((byteFrameHeader[1] & 0xFF) >> 5) != 7) {
+                System.out.println("该MP3文件非法.");
+                return null;// frame sync bits are not all set: not a valid frame
+            }
+            index = (byteFrameHeader[2] & 0xFF) >> 4;
+            Padding = (byteFrameHeader[2] & 2) >> 1;
+        } else {
+            if (FrameHeader.length < 3
+                    || FrameHeader[0] == 'T' && FrameHeader[1] == 'A'
+                    && FrameHeader[2] == 'G') {
+                return null;
+            }
+            index = FrameHeader[2] >> 4;
+            // Each cached byte goes to its own slot; all three used to be written to slot 0,
+            // which corrupted the header of the first returned frame.
+            byteFrameHeader = new byte[]{(byte) FrameHeader[0],
+                    (byte) FrameHeader[1], (byte) FrameHeader[2]};
+            FrameHeader = null;
+        }
+        if (index < 1 || index > 14) {
+            return null;// bit-rate lookup index out of range
+        }
+        selectByteRate(index - 1);
+        if (Frequency == 0) {
+            // Without a parsed header the division below would fail with "/ by zero".
+            throw new IllegalStateException("parserMp3Header() must succeed before ParserFrame()");
+        }
+        // Frame length; ByteRate and Padding may change from frame to frame.
+        int frameSize = Sample / 8 * ByteRate * 1000 / Frequency + Padding;
+        byte[] temp = readFull(frameSize - 3);
+        if (temp == null) {
+            return null;
+        }
+        byte[] data = new byte[frameSize];
+        System.arraycopy(byteFrameHeader, 0, data, 0, 3);
+        System.arraycopy(temp, 0, data, 3, temp.length);
+        return data;
+    }
+
+    /**
+     * Reads up to {@code size} bytes, looping until the buffer is full or the stream ends.
+     *
+     * @param size number of bytes wanted
+     * @return the bytes read (shorter than {@code size} when the stream ended early), or
+     * {@code null} when nothing could be read
+     * @throws IOException if reading the stream fails
+     */
+    public byte[] readFull(int size) throws IOException {
+        byte[] data = new byte[size];
+        int n = 0;
+        while (n < size) {
+            int k = stream.read(data, n, size - n);
+            if (k < 0) {
+                break;
+            }
+            n += k;
+        }
+        if (n <= 0) {
+            return null;
+        }
+        if (n < size) {
+            byte[] temp = new byte[n];
+            System.arraycopy(data, 0, temp, 0, n);
+            return temp;
+        }
+        return data;
+    }
+
+    /**
+     * Skips exactly {@code skipLength} bytes.
+     *
+     * @param skipLength number of bytes to skip
+     * @return true when all bytes were skipped, false when the stream ended first
+     * @throws IOException if reading the stream fails
+     */
+    public boolean skipBytes(long skipLength) throws IOException {
+        long k = 0;
+        while (k < skipLength) {
+            long n = stream.skip(skipLength - k);
+            if (n < 0) {
+                return false;
+            }
+            if (n == 0) {
+                // skip() may return 0 both at end of stream and spuriously; a single read
+                // tells the two apart and keeps the loop from spinning forever at EOF.
+                if (stream.read() < 0) {
+                    return false;
+                }
+                n = 1;
+            }
+            k += n;
+        }
+        return true;
+    }
+
+}
--- a/src/main/java/org/wlld/voice/VoiceTest.java
+++ b/src/main/java/org/wlld/voice/VoiceTest.java
@ -0,0 +1,15 @@
+package org.wlld.voice;
+
+import javax.sound.sampled.UnsupportedAudioFileException;
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Manual smoke test: loads a wav file from a fixed local path and prints its frame count and
+ * the value of sample 140.
+ */
+public class VoiceTest {
+    public static void main(String[] args) throws UnsupportedAudioFileException, IOException {
+        WaveFile wav = new WaveFile(new File("D:\\sondTest\\wo.wav"));
+        // Sample 140 is read before anything is printed, as a quick sanity probe.
+        int amplitudeExample = wav.getSampleInt(140);
+        long frames = wav.getFramesCount();
+        System.out.println("帧数:" + frames);
+        System.out.println("140帧的幅度：" + amplitudeExample);
+    }
+}
--- a/src/main/java/org/wlld/voice/WaveFile.java
+++ b/src/main/java/org/wlld/voice/WaveFile.java
@ -0,0 +1,120 @@
+package org.wlld.voice;
+
+import javax.sound.sampled.*;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+/**
+ * Loads an audio file fully into memory for sample access and, when an audio line is
+ * available, prepares a {@link Clip} for playback.
+ */
+public class WaveFile {
+    public final int NOT_SPECIFIED = AudioSystem.NOT_SPECIFIED; // -1
+    public final int INT_SIZE = 4;
+
+    private int sampleSize = NOT_SPECIFIED; // bytes per sample
+    private long framesCount = NOT_SPECIFIED;
+    private int sampleRate = NOT_SPECIFIED;
+    private int channelsNum;
+    private byte[] data;      // raw audio bytes
+    private AudioInputStream ais;
+    private AudioFormat af;
+
+    private Clip clip;
+    private boolean canPlay;
+
+    /**
+     * Reads the whole file into memory and tries to open it on a clip.
+     *
+     * @param file the audio file to load
+     * @throws FileNotFoundException         if {@code file} does not exist
+     * @throws UnsupportedAudioFileException if the file format is not recognised
+     * @throws IOException                   if reading fails or the data length is unusable
+     */
+    public WaveFile(File file) throws UnsupportedAudioFileException, IOException {
+        if (!file.exists()) {
+            throw new FileNotFoundException(file.getAbsolutePath());
+        }
+
+        ais = AudioSystem.getAudioInputStream(file);
+        try {
+            af = ais.getFormat();
+            framesCount = ais.getFrameLength();
+            sampleRate = (int) af.getSampleRate();
+            sampleSize = af.getSampleSizeInBits() / 8;
+            channelsNum = af.getChannels();
+
+            long dataLength = framesCount * af.getSampleSizeInBits() * af.getChannels() / 8;
+            // An unspecified frame length (-1) or a file beyond 2 GB cannot become a byte[].
+            if (dataLength < 0 || dataLength > Integer.MAX_VALUE) {
+                throw new IOException("unsupported audio data length: " + dataLength);
+            }
+
+            data = new byte[(int) dataLength];
+            // A single read() may return fewer bytes than requested, so keep reading until
+            // the buffer is full or the stream stops delivering data.
+            int filled = 0;
+            while (filled < data.length) {
+                int n = ais.read(data, filled, data.length - filled);
+                if (n <= 0) {
+                    break;
+                }
+                filled += n;
+            }
+        } finally {
+            // The stream is only needed for loading; release the file handle right away.
+            ais.close();
+        }
+
+        AudioInputStream aisForPlay = AudioSystem.getAudioInputStream(file);
+        try {
+            clip = AudioSystem.getClip();
+            clip.open(aisForPlay);
+            clip.setFramePosition(0);
+            canPlay = true;
+        } catch (LineUnavailableException e) {
+            canPlay = false;
+            System.out.println("I can play only 8bit and 16bit music.");
+            // No clip will consume this stream, so close it here.
+            aisForPlay.close();
+        }
+    }
+
+    /**
+     * @return true when the clip was opened successfully in the constructor
+     */
+    public boolean isCanPlay() {
+        return canPlay;
+    }
+
+    /**
+     * Starts playback.
+     *
+     * @throws IllegalStateException if no clip could be obtained for this file
+     */
+    public void play() {
+        requireClip().start();
+    }
+
+    /**
+     * Stops playback.
+     *
+     * @throws IllegalStateException if no clip could be obtained for this file
+     */
+    public void stop() {
+        requireClip().stop();
+    }
+
+    /**
+     * Returns the clip, replacing a bare NullPointerException with a descriptive error.
+     */
+    private Clip requireClip() {
+        if (clip == null) {
+            throw new IllegalStateException("no audio clip available; check isCanPlay() first");
+        }
+        return clip;
+    }
+
+    public AudioFormat getAudioFormat() {
+        return af;
+    }
+
+    /**
+     * @return the sample size in bytes
+     */
+    public int getSampleSize() {
+        return sampleSize;
+    }
+
+    /**
+     * @return the frame count divided by the format's frame rate
+     */
+    public double getDurationTime() {
+        return getFramesCount() / getAudioFormat().getFrameRate();
+    }
+
+    public long getFramesCount() {
+        return framesCount;
+    }
+
+
+    /**
+     * Returns sample (amplitude value). Note that in case of stereo samples
+     * go one after another. I.e. 0 - first sample of left channel, 1 - first
+     * sample of the right channel, 2 - second sample of the left channel, 3 -
+     * second sample of the right channel, etc.
+     *
+     * @param sampleNumber zero-based index into the interleaved sample sequence
+     * @return the sample bytes assembled as a little-endian int
+     * @throws IllegalArgumentException if {@code sampleNumber} is outside the loaded data
+     */
+    public int getSampleInt(int sampleNumber) {
+
+        if (sampleNumber < 0 || sampleNumber >= data.length / sampleSize) {
+            throw new IllegalArgumentException(
+                    "sample number can't be < 0 or >= data.length/"
+                            + sampleSize);
+        }
+
+        byte[] sampleBytes = new byte[4]; //4byte = int
+
+        // Samples are interleaved, so sample k starts at byte k * sampleSize. The offset used
+        // to be multiplied by the channel count as well, which contradicted both the contract
+        // above and the bounds check and ran past the buffer for multi-channel files.
+        int offset = sampleNumber * sampleSize;
+        for (int i = 0; i < sampleSize; i++) {
+            sampleBytes[i] = data[offset + i];
+        }
+
+        // NOTE(review): the unused high bytes stay zero, so the value is not sign-extended;
+        // confirm whether callers expect signed amplitudes.
+        return ByteBuffer.wrap(sampleBytes)
+                .order(ByteOrder.LITTLE_ENDIAN).getInt();
+    }
+
+    public int getSampleRate() {
+        return sampleRate;
+    }
+
+    public Clip getClip() {
+        return clip;
+    }
+}