mirror of
https://gitee.com/mymagicpower/AIAS.git
synced 2024-11-29 18:58:16 +08:00
rdkit java
This commit is contained in:
parent
be8bbf6396
commit
3f5bef7cc5
40
biology_sdks/rdkit_java/README.md
Normal file
40
biology_sdks/rdkit_java/README.md
Normal file
@ -0,0 +1,40 @@
|
||||
## 化学信息学的开源工具包
|
||||
RDKit是一个用于化学信息学的开源工具包,基于对化合物2D和3D分子操作,利用机器学习方法进行化合物描述符生成,
|
||||
fingerprint生成,化合物结构相似性计算,2D和3D分子展示等。
|
||||
将化学与机器学习联系起来的、非常实用的库。可以在很多种化学文件如mol2,mol,Smiles,sdf等之间互相转化,并能将其展示成2D、3D等形式供开发人员使用。
|
||||
这里给出一个java实现。
|
||||
|
||||
![img](https://djl-model.oss-cn-hongkong.aliyuncs.com/AIAS/biology_sdks/rdkit.jpeg)
|
||||
|
||||
### 例子包括
|
||||
- 读写分子
|
||||
- 特征提取 & 分子相似性计算
|
||||
相似度计算给出了三种计算方式的例子。
|
||||
![img](https://djl-model.oss-cn-hongkong.aliyuncs.com/AIAS/biology_sdks/mol.png)
|
||||
|
||||
## 运行例子 - SimpleSmilesExample
|
||||
运行成功后,命令行应该看到下面的信息:
|
||||
```text
|
||||
[INFO ] - smi1: c1ccccc1
|
||||
[INFO ] - smi2: c1ccccn1
|
||||
[INFO ] - AllBitSimilarity: 0.98681640625
|
||||
[INFO ] - CosineSimilarity: 0.4147806778921701
|
||||
[INFO ] - DiceSimilarity: 0.5454545454545454
|
||||
|
||||
```
|
||||
|
||||
### 依赖库
|
||||
[下载本地依赖库](https://djl-model.oss-cn-hongkong.aliyuncs.com/AIAS/biology_sdks/native.zip)
|
||||
Java的System.load 和 System.loadLibrary都可以用来加载库文件。
|
||||
如果使用System.loadLibrary:参数为库文件名
|
||||
例如你可以这样载入一个windows平台下JNI库文件 System.loadLibrary ("GraphMolWrap"),
|
||||
这里GraphMolWrap必须在 java.library.path这一jvm变量所指向的路径中。
|
||||
- 默认情况下,Windows平台下包含下面的路径:
|
||||
1)和jre相关的目录
|
||||
2)程序当前目录
|
||||
3)Windows目录
|
||||
4)系统目录(system32)
|
||||
5)系统环境变量path指定的目录
|
||||
|
||||
- 在linux下添加一个java.library.path的方法如下:
|
||||
在/etc/profile 后面加上一行 export LD_LIBRARY_PATH=路径
|
BIN
biology_sdks/rdkit_java/lib/org.RDKit.jar
Executable file
BIN
biology_sdks/rdkit_java/lib/org.RDKit.jar
Executable file
Binary file not shown.
100
biology_sdks/rdkit_java/pom.xml
Normal file
100
biology_sdks/rdkit_java/pom.xml
Normal file
@ -0,0 +1,100 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
~ Licensed to the Apache Software Foundation (ASF) under one
|
||||
~ or more contributor license agreements. See the NOTICE file
|
||||
~ distributed with this work for additional information
|
||||
~ regarding copyright ownership. The ASF licenses this file
|
||||
~ to you under the Apache License, Version 2.0 (the
|
||||
~ "License"); you may not use this file except in compliance
|
||||
~ with the License. You may obtain a copy of the License at
|
||||
~
|
||||
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing,
|
||||
~ software distributed under the License is distributed on an
|
||||
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
~ KIND, either express or implied. See the License for the
|
||||
~ specific language governing permissions and limitations
|
||||
~ under the License.
|
||||
-->
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>calvin</groupId>
    <artifactId>rdkit-java</artifactId>
    <version>0.1</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <!-- Single source of truth: the language level is defined once in <properties>. -->
                    <source>${maven.compiler.source}</source>
                    <target>${maven.compiler.target}</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.6</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.30</version>
        </dependency>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>
        <!-- SLF4J 1.7 binding for Log4j 2. Kept at 2.17.1 or later: 2.12.1 resolves a
             log4j-core that is affected by CVE-2021-44228 (Log4Shell). -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>2.17.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.12.0</version>
        </dependency>

        <!-- RDKit Java wrapper, shipped in lib/ because it is resolved from a local file.
             NOTE(review): Maven marks the system scope as deprecated; consider installing
             the jar into a repository instead. -->
        <dependency>
            <groupId>rdkit</groupId>
            <artifactId>rdkit</artifactId>
            <version>1.0</version>
            <scope>system</scope>
            <systemPath>${project.basedir}/lib/org.RDKit.jar</systemPath>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.18</version>
            <scope>provided</scope>
        </dependency>

        <!-- Test framework only: test scope keeps it (and its transitive jars) off the
             runtime classpath. -->
        <dependency>
            <groupId>org.testng</groupId>
            <artifactId>testng</artifactId>
            <version>6.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

</project>
|
45
biology_sdks/rdkit_java/rdkit_java.iml
Normal file
45
biology_sdks/rdkit_java/rdkit_java.iml
Normal file
@ -0,0 +1,45 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
  <!-- IntelliJ IDEA module descriptor for the Maven module. The library entries below
       mirror the dependencies declared in pom.xml. -->
  <component name="CheckStyle-IDEA-Module">
    <option name="configuration">
      <map />
    </option>
  </component>
  <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
    <output url="file://$MODULE_DIR$/target/classes" />
    <output-test url="file://$MODULE_DIR$/target/test-classes" />
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
      <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
      <sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
      <excludeFolder url="file://$MODULE_DIR$/target" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" name="Maven: com.google.code.gson:gson:2.8.6" level="project" />
    <orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.30" level="project" />
    <orderEntry type="library" name="Maven: commons-cli:commons-cli:1.4" level="project" />
    <orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-slf4j-impl:2.12.1" level="project" />
    <orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.12.1" level="project" />
    <orderEntry type="library" scope="RUNTIME" name="Maven: org.apache.logging.log4j:log4j-core:2.12.1" level="project" />
    <orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.12.0" level="project" />
    <orderEntry type="module-library">
      <library name="Maven: rdkit:rdkit:1.0">
        <CLASSES>
          <root url="jar://$MODULE_DIR$/lib/org.RDKit.jar!/" />
        </CLASSES>
        <JAVADOC />
        <SOURCES />
      </library>
    </orderEntry>
    <orderEntry type="library" scope="PROVIDED" name="Maven: org.projectlombok:lombok:1.18.18" level="project" />
    <orderEntry type="library" name="Maven: org.testng:testng:6.8.1" level="project" />
    <orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
    <orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
    <orderEntry type="library" name="Maven: org.beanshell:bsh:2.0b4" level="project" />
    <orderEntry type="library" name="Maven: com.beust:jcommander:1.27" level="project" />
    <orderEntry type="library" name="Maven: org.yaml:snakeyaml:1.6" level="project" />
  </component>
</module>
|
@ -0,0 +1,54 @@
|
||||
package me.calvin.example;
|
||||
|
||||
import org.RDKit.ExplicitBitVect;
|
||||
import org.RDKit.RDKFuncs;
|
||||
import org.RDKit.RWMol;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* @author Calvin <179209347@qq.com>
|
||||
*/
|
||||
public class SimpleSmilesExample {
|
||||
private static final Logger logger = LoggerFactory.getLogger(SimpleSmilesExample.class);
|
||||
|
||||
static {
|
||||
try {
|
||||
//For mac
|
||||
System.load("/path/to/native/macosx.x86_64/libGraphMolWrap.jnilib");
|
||||
} catch (UnsatisfiedLinkError e) {
|
||||
System.err.println("Native code library failed to load.\n" + e);
|
||||
System.exit(1);
|
||||
}
|
||||
// System.loadLibrary("GraphMolWrap");
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
String smi1 = "c1ccccc1";
|
||||
String smi2 = "c1ccccn1";
|
||||
|
||||
logger.info("smi1: {}", smi1);
|
||||
logger.info("smi2: {}", smi2);
|
||||
|
||||
//读写分子
|
||||
RWMol m1 = RWMol.MolFromSmiles(smi1);
|
||||
RWMol m2 = RWMol.MolFromSmiles(smi2);
|
||||
ExplicitBitVect fp1 = RDKFuncs.RDKFingerprintMol(m1);
|
||||
ExplicitBitVect fp2 = RDKFuncs.RDKFingerprintMol(m2);
|
||||
//计算分子相似性
|
||||
double dis = RDKFuncs.AllBitSimilarity(fp1, fp2);
|
||||
logger.info("AllBitSimilarity: {}", dis);
|
||||
|
||||
dis = RDKFuncs.CosineSimilarity(fp1, fp2);
|
||||
logger.info("CosineSimilarity: {}", dis);
|
||||
|
||||
fp1 = RDKFuncs.MACCSFingerprintMol(m1);
|
||||
fp2 = RDKFuncs.MACCSFingerprintMol(m2);
|
||||
dis = RDKFuncs.DiceSimilarity(fp1, fp2);
|
||||
//Dice距离用于度量两个集合的相似性,因为可以把字符串理解为一种集合,因此Dice距离也会用于度量字符串的相似性。
|
||||
logger.info("DiceSimilarity: {}", dis);
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,105 @@
|
||||
package me.calvin.example.utils;
|
||||
|
||||
import org.RDKit.ExplicitBitVect;
|
||||
import org.RDKit.RDKFuncs;
|
||||
import org.RDKit.ROMol;
|
||||
import org.RDKit.RWMol;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
/**
 * Conversions between byte arrays, hexadecimal strings, bit strings and
 * {@link ByteBuffer}s.
 *
 * @author Calvin <179209347@qq.com>
 */
public class BytesUtils {
    /**
     * Renders a byte array as an upper-case hexadecimal string, two characters per byte.
     *
     * @param src bytes to render
     * @return the hexadecimal string, or {@code null} when {@code src} is null or empty
     */
    private static String bytesToHexString(byte[] src) {
        if (src == null || src.length == 0) {
            return null;
        }
        StringBuilder builder = new StringBuilder(src.length * 2);
        for (byte b : src) {
            // Mask with 0xFF so negative bytes are rendered as their unsigned value.
            String hv = Integer.toHexString(b & 0xFF).toUpperCase();
            if (hv.length() < 2) {
                builder.append('0');
            }
            builder.append(hv);
        }
        return builder.toString();
    }

    /**
     * Parses a hexadecimal string into bytes, two characters per byte. Upper- and
     * lower-case digits are both accepted.
     *
     * @param hexString the hex string
     * @return the decoded bytes, or {@code null} when {@code hexString} is null or empty
     * @throws IllegalArgumentException if the string has an odd number of characters or
     *                                  contains a character that is not a hex digit
     */
    public static byte[] hexStringToBytes(String hexString) {
        if (hexString == null || hexString.isEmpty()) {
            return null;
        }
        final int length = hexString.length();
        if ((length & 1) != 0) {
            // A trailing half byte cannot be decoded; fail with a clear message instead
            // of running off the end of the string.
            throw new IllegalArgumentException(
                    "Hex string must have an even number of characters, got " + length);
        }
        final byte[] byteArray = new byte[length >> 1];
        for (int i = 0; i < byteArray.length; i++) {
            int high = Character.digit(hexString.charAt(2 * i), 16);
            int low = Character.digit(hexString.charAt(2 * i + 1), 16);
            if (high < 0 || low < 0) {
                // Character.digit returns -1 for non-hex characters; reject them rather
                // than folding -1 into the output byte.
                throw new IllegalArgumentException(
                        "Invalid hexadecimal character near index " + (2 * i));
            }
            byteArray[i] = (byte) ((high << 4) | low);
        }
        return byteArray;
    }

    /**
     * Copies a byte array into a new {@link ByteBuffer} that is ready for reading:
     * position 0, limit equal to the array length. The input array is not modified.
     *
     * @param value the bytes to copy
     * @return a heap buffer holding a copy of {@code value}
     */
    public static ByteBuffer bytesToByteBuffer(byte[] value) {
        ByteBuffer byteBuffer = ByteBuffer.allocate(value.length);
        // put (not get) copies the array INTO the buffer; get would overwrite the
        // caller's array with the buffer's zeros.
        byteBuffer.put(value);
        // Rewind so the returned buffer can be read from the start.
        byteBuffer.flip();
        return byteBuffer;
    }

    /**
     * Renders a byte array as a string of '0' and '1' characters, eight per byte,
     * most significant bit first.
     *
     * @param bytes the bytes to render
     * @return the bit string
     */
    public static String bytesToBit(byte[] bytes) {
        StringBuilder sb = new StringBuilder(bytes.length * 8);
        for (byte b : bytes) {
            sb.append(byteToBit(b));
        }
        return sb.toString();
    }

    /**
     * Renders one byte as eight '0'/'1' characters, most significant bit first.
     *
     * @param b the byte
     * @return the 8-character bit string
     */
    public static String byteToBit(byte b) {
        return ""
                + (byte) ((b >> 7) & 0x1) + (byte) ((b >> 6) & 0x1)
                + (byte) ((b >> 5) & 0x1) + (byte) ((b >> 4) & 0x1)
                + (byte) ((b >> 3) & 0x1) + (byte) ((b >> 2) & 0x1)
                + (byte) ((b >> 1) & 0x1) + (byte) ((b >> 0) & 0x1);
    }
}
|
@ -0,0 +1,27 @@
|
||||
package me.calvin.example.utils;
|
||||
|
||||
import org.RDKit.ExplicitBitVect;
|
||||
import org.RDKit.RDKFuncs;
|
||||
import org.RDKit.ROMol;
|
||||
import org.RDKit.RWMol;
|
||||
|
||||
/**
|
||||
* Dice距离用于度量两个集合的相似性,因为可以把字符串理解为一种集合,因此Dice距离也会用于度量字符串的相似性。
|
||||
*/
|
||||
public class DiceUtils {
|
||||
/**
|
||||
* calculate Dice Distance between two strings
|
||||
*
|
||||
* @param smi1 the 1st string
|
||||
* @param smi2 the 2nd string
|
||||
* @return Dice Distance between smi1 and smi2
|
||||
* @author
|
||||
*/
|
||||
public static double getDistance(String smi1, String smi2) {
|
||||
ROMol m1 = RWMol.MolFromSmiles(smi1);
|
||||
ROMol m2 = RWMol.MolFromSmiles(smi2);
|
||||
ExplicitBitVect fp1 = RDKFuncs.MACCSFingerprintMol(m1);
|
||||
ExplicitBitVect fp2 = RDKFuncs.MACCSFingerprintMol(m2);
|
||||
return RDKFuncs.DiceSimilarity(fp1, fp2);
|
||||
}
|
||||
}
|
@ -0,0 +1,92 @@
|
||||
/**
|
||||
* 在信息理论中,两个等长字符串之间的汉明距离
|
||||
* 是两个字符串对应位置上不同字符的个数,
|
||||
* 换句话说,汉明距离就是将一个字符串替换成另外一个字符串所需要替换的字符长度。
|
||||
* 例如,1011101和1001001之间的汉明距离是2,
|
||||
* toned和roses之间的汉明距离是3.
|
||||
* 汉明权重是字符串相对于同样长度的零字符串的汉明距离,
|
||||
* 也就是说,它是字符串中非零的元素个数:对于二进制字符串来说,就是 1 的个数,
|
||||
* 所以 11101 的汉明重量是 4。
|
||||
* 下面的代码展示了在Java中如何计算汉明距离和汉明重量。
|
||||
*/
|
||||
package me.calvin.example.utils;
|
||||
|
||||
import org.RDKit.ExplicitBitVect;
|
||||
import org.RDKit.RDKFuncs;
|
||||
import org.RDKit.RWMol;
|
||||
|
||||
public class HammingUtils {
|
||||
public static void main(String[] args) {
|
||||
String str1 = "abcdefg";
|
||||
String str2 = "aacceeg";
|
||||
HammingUtils hd = new HammingUtils();
|
||||
int distance = hd.getDistance(str1, str2);
|
||||
System.out.println("distance is " + distance);
|
||||
int weight = hd.getWeight(255);
|
||||
System.out.println("weight is " + weight);
|
||||
|
||||
|
||||
String smi1 = "c1ccccc1";
|
||||
String smi2 = "c1ccccn1";
|
||||
String bits1 = bytesToBit(smi1);
|
||||
String bits2 = bytesToBit(smi2);
|
||||
distance = HammingUtils.getDistance(bits1, bits2);
|
||||
//两个字符串对应位置上不同字符的个数
|
||||
System.out.println("Hamming distance: " + distance);
|
||||
}
|
||||
|
||||
/**
|
||||
* 把smiles转为bit字符串
|
||||
*
|
||||
* @param smiles
|
||||
* @return String
|
||||
*/
|
||||
public static String bytesToBit(String smiles) {
|
||||
RWMol mol = RWMol.MolFromSmiles(smiles);
|
||||
ExplicitBitVect bitVect = RDKFuncs.getMorganFingerprintAsBitVect(mol, 2, 128);
|
||||
// String v = RDKFuncs.BitVectToText(bitVect);
|
||||
String fps = RDKFuncs.BitVectToFPSText(bitVect);
|
||||
byte[] bytes = BytesUtils.hexStringToBytes(fps);
|
||||
String bits = BytesUtils.bytesToBit(bytes);
|
||||
// logger.info("smiles {} bits: {}", smiles, bits);
|
||||
return bits;
|
||||
}
|
||||
|
||||
/**
|
||||
* calculate Hamming Distance between two strings
|
||||
*
|
||||
* @param str1 the 1st string
|
||||
* @param str2 the 2nd string
|
||||
* @return Hamming Distance between str1 and str2
|
||||
* @author
|
||||
*/
|
||||
public static int getDistance(String str1, String str2) {
|
||||
int distance;
|
||||
if (str1.length() != str2.length()) {
|
||||
distance = -1;
|
||||
} else {
|
||||
distance = 0;
|
||||
for (int i = 0; i < str1.length(); i++) {
|
||||
if (str1.charAt(i) != str2.charAt(i)) {
|
||||
distance++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return distance;
|
||||
}
|
||||
|
||||
/**
|
||||
* calculate Hamming weight for binary number
|
||||
*
|
||||
* @param i the binary number
|
||||
* @return Hamming weight of the binary number
|
||||
* @author
|
||||
*/
|
||||
public static int getWeight(int i) {
|
||||
int n;
|
||||
for (n = 0; i > 0; n++) {
|
||||
i &= (i - 1);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
}
|
17
biology_sdks/rdkit_java/src/main/resources/log4j2.xml
Normal file
17
biology_sdks/rdkit_java/src/main/resources/log4j2.xml
Normal file
@ -0,0 +1,17 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Configuration status="INFO">
    <!-- One console appender on standard output; each line is "[LEVEL] - message". -->
    <Appenders>
        <Console name="console" target="SYSTEM_OUT">
            <PatternLayout pattern="[%-5level] - %msg%n"/>
        </Console>
    </Appenders>
    <Loggers>
        <!-- Project logger: level can be overridden with the system property
             me.calvin.logging.level and defaults to info. Not additive, so events
             are not also passed to the root logger. -->
        <Logger name="me.calvin" level="${sys:me.calvin.logging.level:-info}" additivity="false">
            <AppenderRef ref="console"/>
        </Logger>
        <!-- Everything else logs at info to the same console appender. -->
        <Root level="info" additivity="false">
            <AppenderRef ref="console"/>
        </Root>
    </Loggers>
</Configuration>
|
Loading…
Reference in New Issue
Block a user