Upload crawl functions
testing_functions.ipynb  ADDED  +686 -0
| 1 |
+
{
|
| 2 |
+
"cells": [
|
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import string\n",
    "import pandas as pd\n",
    "import time\n",
    "import urllib\n",
    "import urllib.request\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'computer science': ['machine learning', 'artificial intelligence', 'hardware architecture', 'computational complexity', 'data structures', 'algorithms', 'graphics', 'databases', 'discrete mathematics', 'human-computer interaction', 'information retrieval', 'multiagent systems', 'neural network'], 'economics': ['general economics', 'theoretical economics', 'econometrics'], 'electrical engineering and system science': ['audio processing', 'speech processing', 'signal processing', 'image and video processing', 'system and controls'], 'mathematics': ['general mathematics', 'general topology', 'group theory', 'numerical analysis', 'probability', 'number theory', 'statistic theory']}\n"
     ]
    }
   ],
   "source": [
    "baseurl = 'http://export.arxiv.org/api/query?search_query='\n",
    "\n",
    "# still ambiguous: what are the keywords?\n",
    "\n",
    "timestamp = \"2020-01-01\"\n",
    "max_results = 10000\n",
    "date = pd.Timestamp(str(timestamp), tz='US/Pacific')\n",
    "\n",
    "topics = json.load(open(\"topics.txt\",\"r\"))\n",
    "print(topics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for key in topics:\n",
    "    # print(key)\n",
    "    # prepare url for each topic\n",
    "    keyword_list = topics[key]\n",
    "    i = 0\n",
    "    for keyword in keyword_list:\n",
    "        if i == 0:\n",
    "            url = baseurl + 'all:' + keyword\n",
    "            i = i + 1\n",
    "        else:\n",
    "            url = url + '+OR+' + 'all:' + keyword\n",
    "    url = url + '&max_results=' + str(max_results)\n",
    "    url = url.replace(' ', '%20')\n",
    "\n",
    "    arxiv_page = urllib.request.urlopen(url,timeout=100).read()\n",
    "    with open(key+\".xml\",\"wb\") as outfile:\n",
    "        outfile.write(arxiv_page)\n",
    "    print(url)"
   ]
  },
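  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: the arXiv API also takes a `start` parameter, so large result sets can be fetched in pages instead of one 10000-result request. A sketch, assuming the same `baseurl` and a hypothetical pre-encoded `search` query string:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch: page through results with start/max_results.\n",
    "search = 'all:machine%20learning'  # hypothetical example query\n",
    "page_size = 1000\n",
    "for start in range(0, 3000, page_size):\n",
    "    page_url = baseurl + search + '&start=' + str(start) + '&max_results=' + str(page_size)\n",
    "    page = urllib.request.urlopen(page_url, timeout=100).read()\n",
    "    with open(\"page_{}.xml\".format(start), \"wb\") as outfile:\n",
    "        outfile.write(page)\n",
    "    time.sleep(3)  # arXiv asks clients to pause between requests\n",
    "    print(page_url)"
   ]
  },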
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def crawl_from_url(url):\n",
    "    try:\n",
    "        arxiv_page = urllib.request.urlopen(url,timeout=100).read()\n",
    "        with open(\"save.xml\",\"wb\") as outfile:\n",
    "            outfile.write(arxiv_page)\n",
    "        arxiv_page = str(arxiv_page)\n",
    "        # Each record sits inside an <entry> tag\n",
    "        # <id> holds the arXiv link to the paper\n",
    "        # <updated>, <published> are the latest update/publication timestamps\n",
    "        # <title> is the paper title\n",
    "        # <summary> is the paper abstract\n",
    "        # there can be several <author> tags holding the authors' names\n",
    "        # <link title=\"pdf\" href=\" ... holds the download link for the paper\n",
    "\n",
    "        # extract one record based on the <entry> tag\n",
    "        start = arxiv_page.find(\"<entry>\")\n",
    "        end = arxiv_page.find(\"</entry>\")\n",
    "        extract = arxiv_page[start+7:end]\n",
    "        # print(extract)\n",
    "\n",
    "    except Exception as e:\n",
    "        print(\"Error occurred: \",e)\n",
    "\n",
    "crawl_from_url(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_tag(txt,tagname):\n",
    "    return txt[txt.find(\"<\"+tagname+\">\")+len(tagname)+2:txt.find(\"</\"+tagname+\">\")]\n",
    "\n",
    "def get_record(extract):\n",
    "    # id = extract[extract.find(\"<id>\")+4:extract.find(\"</id>\")]\n",
    "    # updated = extract[extract.find(\"<updated>\")+9:extract.find(\"</updated>\")]\n",
    "    # published = extract[extract.find(\"<published>\")+11:extract.find(\"</published>\")]\n",
    "    # title = extract[extract.find(\"<title>\")+7:extract.find(\"</title>\")]\n",
    "    # summary = extract[extract.find(\"<summary>\")+9:extract.find(\"</summary>\")]\n",
    "    id = extract_tag(extract,\"id\")\n",
    "    updated = extract_tag(extract,\"updated\")\n",
    "    published = extract_tag(extract,\"published\")\n",
    "    title = extract_tag(extract,\"title\").replace(\"\\n \",\"\").strip()\n",
    "    summary = extract_tag(extract,\"summary\").replace(\"\\n\",\"\").strip()\n",
    "    authors = []\n",
    "    while extract.find(\"<author>\")!=-1:\n",
    "        # author = extract[extract.find(\"<name>\")+6:extract.find(\"</name>\")]\n",
    "        author = extract_tag(extract,\"name\")\n",
    "        extract = extract[extract.find(\"</author>\")+9:]\n",
    "        authors.append(author)\n",
    "    pattern = '<link title=\"pdf\" href=\"'\n",
    "    link_start = extract.find('<link title=\"pdf\" href=\"')\n",
    "    link = extract[link_start+len(pattern):extract.find(\"rel=\",link_start)-2]\n",
    "    return [id, updated, published, title, authors, link, summary]"
   ]
  },
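  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The string-slicing parser above is fragile (it breaks if tag order or spacing changes). A sketch of the same extraction with `xml.etree.ElementTree`, assuming the standard Atom namespace used by the arXiv feed:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged alternative to get_record: parse <entry> elements with ElementTree.\n",
    "import xml.etree.ElementTree as ET\n",
    "\n",
    "ATOM = \"{http://www.w3.org/2005/Atom}\"\n",
    "\n",
    "def get_records_et(xml_text):\n",
    "    root = ET.fromstring(xml_text)\n",
    "    out = []\n",
    "    for entry in root.findall(ATOM + \"entry\"):\n",
    "        authors = [a.findtext(ATOM + \"name\") for a in entry.findall(ATOM + \"author\")]\n",
    "        link = \"\"\n",
    "        for l in entry.findall(ATOM + \"link\"):\n",
    "            if l.get(\"title\") == \"pdf\":\n",
    "                link = l.get(\"href\")\n",
    "        out.append([\n",
    "            entry.findtext(ATOM + \"id\"),\n",
    "            entry.findtext(ATOM + \"updated\"),\n",
    "            entry.findtext(ATOM + \"published\"),\n",
    "            (entry.findtext(ATOM + \"title\") or \"\").strip(),\n",
    "            authors,\n",
    "            link,\n",
    "            (entry.findtext(ATOM + \"summary\") or \"\").strip(),\n",
    "        ])\n",
    "    return out"
   ]
  },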
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'computer science': ['machine learning', 'artificial intelligence', 'hardware architecture', 'computational complexity', 'data structures', 'algorithms', 'graphics', 'databases', 'discrete mathematics', 'human-computer interaction', 'information retrieval', 'multiagent systems', 'neural network'], 'economics': ['general economics', 'theoretical economics', 'econometrics'], 'electrical engineering and system science': ['audio processing', 'speech processing', 'signal processing', 'image and video processing', 'system and controls'], 'mathematics': ['general mathematics', 'general topology', 'group theory', 'numerical analysis', 'probability', 'number theory', 'statistic theory']}\n"
     ]
    }
   ],
   "source": [
    "# load xml\n",
    "topics = json.load(open(\"topics.txt\",\"r\"))\n",
    "print(topics)\n",
    "records = []\n",
    "for key in topics:\n",
    "    with open(key+\".xml\",\"rb\") as infile:\n",
    "        xml = infile.read()\n",
    "    xml = str(xml,encoding=\"utf-8\")\n",
    "    while xml.find(\"<entry>\") != -1:\n",
    "        extract = xml[xml.find(\"<entry>\")+7:xml.find(\"</entry>\")]\n",
    "        xml = xml[xml.find(\"</entry>\")+8:]\n",
    "        records.append([key,*get_record(extract)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3000\n",
      "<class 'list'>\n"
     ]
    }
   ],
   "source": [
    "print(len(records))\n",
    "print(type(records[32][5]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(records,columns=[\"topic\",\"id\",\"updated\",\"published\",\"title\",\"author\",\"link\",\"summary\",])\n",
    "df.to_csv(\"arxiv_crawl.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "topics_descriptions = json.load(open(\"topic_descriptions.txt\",\"r\"))\n",
    "print(topics_descriptions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "embed = model.encode(\"\"\"Recommendation systems for different Document Networks (DN) such as the World\n",
    "Wide Web (WWW) and Digital Libraries, often use distance functions extracted\n",
    "from relationships among documents and keywords. For instance, documents in the\n",
    "WWW are related via a hyperlink network, while documents in bibliographic\n",
    "databases are related by citation and collaboration networks. Furthermore,\n",
    "documents are related to keyterms. The distance functions computed from these\n",
    "relations establish associative networks among items of the DN, referred to as\n",
    "Distance Graphs, which allow recommendation systems to identify relevant\n",
    "associations for individual users. However, modern recommendation systems need\n",
    "to integrate associative data from multiple sources such as different\n",
    "databases, web sites, and even other users. Thus, we are presented with a\n",
    "problem of combining evidence (about associations between items) from different\n",
    "sources characterized by distance functions. In this paper we describe our work\n",
    "on (1) inferring relevant associations from, as well as characterizing,\n",
    "semi-metric distance graphs and (2) combining evidence from different distance\n",
    "graphs in a recommendation system. Regarding (1), we present the idea of\n",
    "semi-metric distance graphs, and introduce ratios to measure semi-metric\n",
    "behavior. We compute these ratios for several DN such as digital libraries and\n",
    "web sites and show that they are useful to identify implicit associations.\n",
    "Regarding (2), we describe an algorithm to combine evidence from distance\n",
    "graphs that uses Evidence Sets, a set structure based on Interval Valued Fuzzy\n",
    "Sets and Dempster-Shafer Theory of Evidence. This algorithm has been developed\n",
    "for a recommendation system named TalkMine.\"\"\")\n",
    "for topic in topics_descriptions:\n",
    "    description = topics_descriptions[topic]\n",
    "    embed_desc = model.encode(description)\n",
    "    print(topic+\": \"+str(cos_sim(embed,embed_desc)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import chromadb\n",
    "from chromadb import Documents, EmbeddingFunction, Embeddings\n",
    "\n",
    "from transformers import AutoModel\n",
    "from numpy.linalg import norm\n",
    "\n",
    "cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))\n",
    "model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en',\n",
    "                                  trust_remote_code=True,\n",
    "                                  cache_dir='models') # trust_remote_code is needed to use the encode method\n",
    "\n",
    "class JinaAIEmbeddingFunction(EmbeddingFunction):\n",
    "    def __init__(self, model):\n",
    "        super().__init__()\n",
    "        self.model = model\n",
    "\n",
    "    def __call__(self, input: Documents) -> Embeddings:\n",
    "        embeddings = self.model.encode(input)\n",
    "        return embeddings.tolist()\n",
    "\n",
    "ef = JinaAIEmbeddingFunction(model)"
   ]
  },
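  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check (a sketch with made-up sentences) that `model.encode` and `cos_sim` behave as expected: paraphrases should score higher than unrelated text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged usage check with hypothetical example sentences.\n",
    "a = model.encode(\"How is the weather today?\")\n",
    "b = model.encode(\"What is the current weather like today?\")\n",
    "c = model.encode(\"Group theory and number theory\")\n",
    "print(cos_sim(a, b))  # expected: relatively high\n",
    "print(cos_sim(a, c))  # expected: noticeably lower"
   ]
  },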
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "client = chromadb.PersistentClient(path=\"arxivdb/\")\n",
    "# first creation, embedding function = default\n",
    "# collection = client.create_collection(name=\"arxiv_records\",metadata={\"hnsw:space\": \"cosine\"})\n",
    "# later call\n",
    "collection = client.get_or_create_collection(name=\"arxiv_records\", embedding_function=ef, metadata={\"hnsw:space\": \"cosine\"})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "client.delete_collection(name=\"arxiv_records\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "con = sqlite3.connect(\"arxiv_records_sql\")\n",
    "cur = con.cursor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "ename": "OperationalError",
     "evalue": "table arxivsql already exists",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mOperationalError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[14], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mcur\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\"\"\u001b[39;49m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;124;43m create table arxivsql(\u001b[39;49m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124;43m id,\u001b[39;49m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;124;43m topic,\u001b[39;49m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;124;43m title,\u001b[39;49m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;124;43m authors,\u001b[39;49m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;124;43m year_updated,\u001b[39;49m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;124;43m year_published,\u001b[39;49m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;124;43m link\u001b[39;49m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;124;43m )\u001b[39;49m\n\u001b[0;32m 11\u001b[0m \u001b[38;5;124;43m\"\"\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 12\u001b[0m con\u001b[38;5;241m.\u001b[39mcommit()\n",
      "\u001b[1;31mOperationalError\u001b[0m: table arxivsql already exists"
     ]
    }
   ],
   "source": [
    "cur.execute(\"\"\"\n",
    "    create table arxivsql(\n",
    "        id,\n",
    "        topic,\n",
    "        title,\n",
    "        authors,\n",
    "        year_updated,\n",
    "        year_published,\n",
    "        link\n",
    "    )\n",
    "\"\"\")\n",
    "con.commit()"
   ]
  },
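  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `OperationalError` above comes from re-running the cell; `create table if not exists` makes it re-runnable without dropping the table first (a sketch):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged fix: create the table only when it is missing.\n",
    "cur.execute(\"\"\"\n",
    "    create table if not exists arxivsql(\n",
    "        id,\n",
    "        topic,\n",
    "        title,\n",
    "        authors,\n",
    "        year_updated,\n",
    "        year_published,\n",
    "        link\n",
    "    )\n",
    "\"\"\")\n",
    "con.commit()"
   ]
  },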
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "cur.execute(\"drop table arxivsql\")\n",
    "con.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3000, 8)\n",
      "<class 'numpy.ndarray'>\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_csv(\"arxiv_crawl.csv\",index_col=0,header=0)\n",
    "print(df.shape)\n",
    "records = df.values\n",
    "print(type(records))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Domenico Amato, Giosue' Lo Bosco, Raffaele Giancarl\n"
     ]
    }
   ],
   "source": [
    "def chunk_text(text, max_char=400):\n",
    "    \"\"\"\n",
    "    Chunk a long text into several chunks, with each chunk about 300-400 characters long,\n",
    "    but make sure no word is cut in half.\n",
    "    Args:\n",
    "        text: The long text to be chunked.\n",
    "        max_char: The maximum number of characters per chunk (default: 400).\n",
    "    Returns:\n",
    "        A list of chunks.\n",
    "    \"\"\"\n",
    "    chunks = []\n",
    "    current_chunk = \"\"\n",
    "    words = text.split()\n",
    "    for word in words:\n",
    "        # Check if adding the word would exceed the chunk limit\n",
    "        if len(current_chunk) + len(word) + 1 >= max_char:\n",
    "            chunks.append(current_chunk)\n",
    "            current_chunk = word\n",
    "        else:\n",
    "            current_chunk += \" \" + word\n",
    "    chunks.append(current_chunk.strip())\n",
    "    return chunks\n",
    "\n",
    "def process_authors(authors):\n",
    "    # join author names with a comma separator\n",
    "    return \", \".join(authors)\n",
    "\n",
    "print(process_authors(records[32][5]))"
   ]
  },
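  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small sanity check (a sketch with synthetic text) on `chunk_text`: no chunk should reach `max_char`, and no word should be split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged check with synthetic input.\n",
    "sample = \" \".join([\"chunkword\"] * 500)\n",
    "chunks = chunk_text(sample, max_char=400)\n",
    "print(len(chunks), max(len(c) for c in chunks))  # max chunk length should stay under 400"
   ]
  },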
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "200\n",
      "400\n",
      "600\n",
      "800\n",
      "1000\n",
      "1200\n",
      "1400\n",
      "1600\n",
      "1800\n",
      "2000\n",
      "2200\n",
      "2400\n",
      "2600\n",
      "2800\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Insert of existing embedding ID: 2111.13171v1_0\n",
      "Insert of existing embedding ID: 2111.13171v1_1\n",
      "Insert of existing embedding ID: 2111.13171v1_2\n",
      "Insert of existing embedding ID: 2111.13171v1_3\n",
      "Insert of existing embedding ID: 2111.13171v1_4\n",
      "Add of existing embedding ID: 2111.13171v1_0\n",
      "Add of existing embedding ID: 2111.13171v1_1\n",
      "Add of existing embedding ID: 2111.13171v1_2\n",
      "Add of existing embedding ID: 2111.13171v1_3\n",
      "Add of existing embedding ID: 2111.13171v1_4\n",
      "Insert of existing embedding ID: 2211.03756v1_0\n",
      "Insert of existing embedding ID: 2211.03756v1_1\n",
      "Insert of existing embedding ID: 2211.03756v1_2\n",
      "Insert of existing embedding ID: 2211.03756v1_3\n",
      "Insert of existing embedding ID: 2211.03756v1_4\n",
      "Insert of existing embedding ID: 2211.03756v1_5\n",
      "Add of existing embedding ID: 2211.03756v1_0\n",
      "Add of existing embedding ID: 2211.03756v1_1\n",
      "Add of existing embedding ID: 2211.03756v1_2\n",
      "Add of existing embedding ID: 2211.03756v1_3\n",
      "Add of existing embedding ID: 2211.03756v1_4\n",
      "Add of existing embedding ID: 2211.03756v1_5\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3000\n"
     ]
    }
   ],
   "source": [
    "count = 0\n",
    "for record in records:\n",
    "    # add to vector db\n",
    "    embed_text = \"\"\"\n",
    "    Topic: {},\n",
    "    Title: {},\n",
    "    Summary: {}\n",
    "\"\"\".format(\n",
    "        record[0], record[4], record[7]\n",
    "    )\n",
    "    chunks = chunk_text(embed_text)\n",
    "    ids = [record[1][21:] + \"_\" + str(j) for j in range(len(chunks))]\n",
    "    paper_ids = [{\"paper_id\": record[1][21:]} for _ in range(len(chunks))]\n",
    "    collection.add(documents=chunks, metadatas=paper_ids, ids=ids)\n",
    "    # try:\n",
    "    #     query = \"\"\"insert into arxivsql values(\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",\"{}\")\"\"\".format(\n",
    "    #         record[1][21:],\n",
    "    #         record[0],\n",
    "    #         record[4].replace('\"', \"'\"),\n",
    "    #         process_authors(record[5]),\n",
    "    #         record[2][:10],\n",
    "    #         record[3][:10],\n",
    "    #         record[6],\n",
    "    #     )\n",
    "    #     cur.execute(query)\n",
    "    #     con.commit()\n",
    "    # except Exception as e:\n",
    "    #     print(e)\n",
    "    #     print(query)\n",
    "    count += 1\n",
    "    if count % 200 == 0:\n",
    "        print(count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "cur.execute(\"\"\"insert into arxivsql values(\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",\"{}\",\"{}\")\"\"\".format(\n",
    "    \"1906.04027v2\", #editing\n",
    "    \"electrical engineering and system science\",\n",
    "    \"'Did You Hear That?'' Learning to Play Video Games from Audio Cues\",\"Raluca D. Gaina, Matthew Stephenso\",\n",
    "    \"Hadi Abdullah, Muhammad Sajidur Rahman, Washington Garcia, Logan Blue, Kevin Warren, Anurag Swarnim Yadav, Tom Shrimpton, Patrick Trayno\",\n",
    "    \"2019-06-11\",\n",
    "    \"2019-06-10\",\n",
    "    \"http://arxiv.org/pdf/1910.05262v1\"\n",
    "    ))\n",
    "con.commit()"
   ]
  },
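  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "String-formatted SQL like the insert above breaks as soon as a title contains quotes; a sketch of the same insert using sqlite3 parameter binding (reusing the example values from the previous cell):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch: let sqlite3 handle quoting via ? placeholders.\n",
    "row = (\n",
    "    \"1906.04027v2\",\n",
    "    \"electrical engineering and system science\",\n",
    "    \"'Did You Hear That?' Learning to Play Video Games from Audio Cues\",\n",
    "    \"Raluca D. Gaina, Matthew Stephenso\",\n",
    "    \"2019-06-11\",\n",
    "    \"2019-06-10\",\n",
    "    \"http://arxiv.org/pdf/1910.05262v1\",\n",
    ")\n",
    "cur.execute(\"insert into arxivsql values (?, ?, ?, ?, ?, ?, ?)\", row)\n",
    "con.commit()"
   ]
  },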
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'cur' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[11], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mcur\u001b[49m\u001b[38;5;241m.\u001b[39mexecute(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mselect * from arxivsql where True and True\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(res\u001b[38;5;241m.\u001b[39mfetchall())\n",
      "\u001b[1;31mNameError\u001b[0m: name 'cur' is not defined"
     ]
    }
   ],
   "source": [
    "res = cur.execute(\"select * from arxivsql where True and True\")\n",
    "print(res.fetchall())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10740\n"
     ]
    }
   ],
   "source": [
    "print(collection.count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['2211.03756v1_0', '2211.03756v1_1', '2211.03756v1_2', '2211.03756v1_3', '2211.03756v1_4', '2211.03756v1_5', '2211.03756v1_6']\n"
     ]
    }
   ],
   "source": [
    "id = \"2211.03756v1\"\n",
    "ids = [\"{}_{}\".format(id,j) for j in range(0,10)]\n",
    "results = collection.get(ids=ids)\n",
    "print(results[\"ids\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = collection.query(\n",
    "    query_texts = \"recommend academic articles or books related to the field of artificial intelligence, machine learning and technology for the AI intern to explore further\",\n",
    "    where_document = {\n",
    "        \"$or\":[\n",
    "            {\"$contains\":\"AI\"},\n",
    "            {\"$contains\":\"machine learning\"},\n",
    "            {\"$contains\":\"technology\"}\n",
    "        ]\n",
    "    },\n",
    "    n_results=3\n",
    ")"
   ]
  },
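  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`collection.query` returns one inner list per query text under each key (`ids`, `documents`, `metadatas`, `distances`); a sketch of unpacking the first query's hits, assuming the `paper_id` metadata set earlier:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch of reading the Chroma query result layout.\n",
    "for doc_id, doc, meta, dist in zip(\n",
    "    results[\"ids\"][0],\n",
    "    results[\"documents\"][0],\n",
    "    results[\"metadatas\"][0],\n",
    "    results[\"distances\"][0],\n",
    "):\n",
    "    print(dist, meta[\"paper_id\"], doc_id)\n",
    "    print(doc[:100])"
   ]
  },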
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['title', 'author']\n"
     ]
    }
   ],
   "source": [
    "args = {\"title\":\"Attention is all you need\",\n",
    "        \"author\": \"Vaswani, Ashish and Shazeer\"}\n",
    "keys = list(dict.keys(args))\n",
    "print(keys)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def printline(txt, maxline = 100):\n",
    "    for i in range(len(txt)):\n",
    "        if i%maxline == maxline-1:\n",
    "            print(txt[i],end=\"\\n\")\n",
    "        else: print(txt[i],end=\"\")\n",
    "\n",
    "print(dict.keys(results))\n",
    "# get metadatas\n",
    "target = results['metadatas'][0]\n",
    "for rec in target:\n",
    "    print(rec['author'])\n",
    "    print(rec['link'])\n",
    "    printline(rec['summary'])\n",
    "    print(\"\\n------------------------------------------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = target[0]\n",
    "print(t['link'])\n",
    "print(t['title'])\n",
    "print(t['summary'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = '[\"AI technologies\",\"Find academic papers\"]'\n",
    "print(list(args))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}