# First prompt and OpenAI client used to extract node-relation triplets.
# The assignment below was previously fused into the comment line, so the
# client was never actually created.
# NOTE(review): `OpenAI` is not imported in the visible part of this file --
# presumably `from openai import OpenAI` appears elsewhere; confirm.
client = OpenAI()
# System prompt for the triplet-extraction step. It asks the model to return
# every node/edge found in the input text as triplets of the form
# <NODE>--<RELATIONSHIP>--<NODE> and supplies one worked INPUT/OUTPUT example.
# NOTE(review): the example OUTPUT below is a single space-separated line,
# while the parsing loop later in this file splits each element of `lines`
# on ">--<" and expects exactly one triplet per element -- confirm how the
# model response is split into `lines`.
# NOTE(review): the prompt text contains small slips ("a input text",
# "effected by", and actin listed as a microneme protein in the example).
# They are left untouched here because editing the literal changes what is
# sent to the model.
system_text = """You are an expert Knowledge Graph developer with deep knowledge of Biology. Your job is to take a piece of text given to you and extract all the nodes and edges within that text.
The nodes include items such as scientific terms, study methods, scientific studies, chemicals, specific genes mentioned, proteins mentioned, medical treatments, side effects, diseases, any symptoms, mechanisms of action etc. You MUST create the response in the form of a triplet. For example: <NODE>--<RELATIONSHIP>--<NODE>
Here is an example of a input text and output.
INPUT TEXT: Apicomplexan parasites are thought to actively invade the host cell by gliding motility. Recent studies demonstrated that Toxoplasma gondii can invade the host cell in the absence of several core components of the invasion machinery, such as the motor protein myosin A (MyoA), the microneme proteins MIC2 and AMA1 and actin, indicating the presence of alternative invasion mechanisms. Here the roles of MyoA, MLC1, GAP45 and Act1, core components of the gliding machinery, are re-dissected in detail.
OUTPUT: <Apicomplexan parasites>--<thought to actively invade by gliding motility>--<host cell> <Apicomplexan parasites>--<uses>--<gliding motility> <host cells>--<effected by>--<gliding motility> <Toxoplasma gondii>--<type of>--<Apicomplexan parasites> <MyoA>--<type of>--<motor protein> <MIC2>--<type of>--<microneme protein> <AMA1>--<type of>--<microneme protein> <actin>--<type of>--<microneme protein> <MyoA>--<component of >--<gliding machinery> <MLC1>--<component of >--<gliding machinery> <GAP45>--<component of >--<gliding machinery> <Act1>--<component of >--<gliding machinery> """
# System prompt for the node-standardisation step. It asks the model to map
# each input medical term to a MeSH (Medical Subject Headings) term, answering
# "NONE" when no MeSH term exists, in the form INPUT TERM<--->MeSH TERM.
# This rebinds `system_text`, replacing the triplet-extraction prompt assigned
# earlier in the file.
# NOTE(review): the prompt calls the output a "triplet" although the format
# shown is a pair, and the example maps Cancer to "Neoplasm" (singular).
# Left as-is because the literal is sent to the model verbatim.
system_text = """You are an expert Medical Researcher with deep knowledge on BioMedical and Health related concepts and terms. Your job is to take a list of medical terms and provide the list of MeSH(Medical Subject Headings) terms tied to them. If a term does not have a MeSH term return "NONE"
You MUST create the response in the form of a triplet. For example: INPUT TERM<--->MeSH TERM
Here is an example of a input text and output.
INPUT TEXT: Cancer Human Gut Bob Smith injury to the esophagus caused by acid reflux
OUTPUT: Cancer<--->Neoplasm Human Gut<--->Gastrointestinal Tract Bob Smith<--->NONE injury to the esophagus caused by acid reflux<--->Reflux Esophagitis """
在对节点进行标准化的过程中,我确实丢失了一些信息。例如,“tumorigenesis”、“918 cancer samples”和“132 cancer types”都被映射为“Neoplasms”。不过,我很可能会把这个知识图谱用作 RAG 系统的辅助,并且仍会保留原始文本,所以对我来说这是可以接受的权衡。
# Build the edge list and the lookup tables used to query the graph.
#
# Inputs (defined elsewhere in this file):
#   lines     -- iterable of triplet strings shaped like
#                "<node1>--<relationship>--<node2>"
#   stnd_dict -- maps a raw node name to its standardised name; a raw name
#                missing from it raises KeyError
edges = []  # every (node1, node2) pair in input order, duplicates kept
labels = []  # relationship text for each entry of `edges` (parallel list)
edge_labels = {}  # stores a list of labels for each edge
edge_labels_with_doi = {}  # stores a list of labels for each edge - label includes doi
node_edge_dict = {}  # holds a list of edges tied to a node. Will help with searching in the future

for line in lines:
    # Splitting on ">--<" yields [ "<node1", "relationship", "node2>" ].
    parts = line.split(">--<")
    if len(parts) < 3:
        # Blank or malformed line: there is no complete triplet to record.
        continue

    node1 = stnd_dict[parts[0].replace("<", "")]
    node2 = stnd_dict[parts[2].replace(">", "")]
    label = parts[1]
    edge = (node1, node2)

    edges.append(edge)
    labels.append(label)

    # Append the label to the edge. setdefault creates the list the first
    # time an edge is seen; previously a first-seen edge was never inserted,
    # which left edge_labels permanently empty.
    edge_labels.setdefault(edge, []).append(label)

    # Append the edge to both endpoints in the node dictionary. This eases
    # retrieval of the edges for a node in subsequent steps.
    node_edge_dict.setdefault(node1, []).append(edge)
    node_edge_dict.setdefault(node2, []).append(edge)
使用字典可以让您快速访问与某个节点相关联的边,而无需借助图数据库。
# Print every edge that touches the node of interest, followed by the list
# of relationship labels stored for each of those edges. Nothing is printed
# when the node is absent from node_edge_dict.
TEXT_OF_INTEREST = "Alternative Splicing"
matching_edges = node_edge_dict.get(TEXT_OF_INTEREST)
if matching_edges is not None:
    print(matching_edges)
    for connected_edge in matching_edges:
        print(edge_labels[connected_edge])
您还可以轻松地从相连的节点继续向外扩展,构建指定深度的子图。
为了便于访问和使用,您可以使用 NetworkX 库。以下代码展示了创建有向多重图(MultiDiGraph)的基本方法。
import networkx as nx import matplotlib.pyplot as plt
# Create an empty directed multigraph.
G = nx.MultiDiGraph()

# Add one graph edge per (source, target) key of edge_labels. The full list
# of relationship labels travels with the edge as its `label` attribute, and
# every edge gets the same curvature radius (0.15).
for (source, target), relation_list in edge_labels.items():
    G.add_edge(source, target, radius=0.15, label=relation_list)

# Choose a layout.
pos = nx.spring_layout(G)

# Create the figure and make it 15 x 15 inches.
f = plt.figure()
f.set_size_inches(15, 15)

# Draw each edge on its own so it can use the arc3 connection style with
# the radius stored on that edge.
for u, v, attrs in G.edges(data=True):
    radius = attrs['radius']
    nx.draw_networkx_edges(
        G,
        pos,
        edgelist=[(u, v)],
        connectionstyle=f'arc3, rad={radius}',
        label=attrs['label'],
    )

# Draw edge labels. This rebinds the module-level `edge_labels` to a mapping
# rebuilt from the graph's own edge attributes.
# NOTE(review): no node drawing or plt.show() is visible in this part of the
# file -- presumably it follows; confirm.
edge_labels = {}
for u, v, attrs in G.edges(data=True):
    edge_labels[(u, v)] = attrs['label']
nx.draw_networkx_edge_labels(G, pos, edge_labels)