{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Scraping for Binance Announcements\n", "Beta version. Modified on 07-09-2019" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Import all the needed packages:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import bs4 as bs\n", "import urllib.request\n", "import tweepy, os, time" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Twitter app" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Credentials are read from the environment, never hardcoded\n", "consumer_key = os.environ.get('TW_CONSUMER_KEY')\n", "consumer_secret = os.environ.get('TW_CONSUMER_SECRET')\n", "access_token = os.environ.get('TW_ACCESS_TOKEN')\n", "access_token_secret = os.environ.get('TW_ACCESS_TOKEN_SECRET')\n", "# authentication of consumer key and secret\n", "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", "# authentication of access token and secret\n", "auth.set_access_token(access_token, access_token_secret)\n", "api = tweepy.API(auth)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create two empty lists for storing news URLs. A message is sent whenever there is a new item in new_urls that wasn't already in old_urls" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# old_urls remembers items already seen/tweeted; new_urls is refreshed on every pass\n", "old_urls, new_urls = [], []" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a bag of key words for getting matches. 
Don't use plurals, otherwise will get duplicates" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "key_words = ['List', 'list', 'Token Sale', 'Open Trading', 'open trading']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create the function to extract the information from the webpage and get the matchings" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def extract_binance(main_webpage, key_words):\n", "    \"\"\"Scrape the Binance announcements page and return matching articles.\n", "\n", "    Parameters:\n", "        main_webpage: URL of the announcements category page.\n", "        key_words: list of substrings to look for in each article title.\n", "\n", "    Returns:\n", "        A list of [article_text, article_url] pairs, one per matching article.\n", "    \"\"\"\n", "    final_list = []\n", "    sauce = urllib.request.urlopen(main_webpage).read()\n", "    soup = bs.BeautifulSoup(sauce, 'lxml')\n", "    # Renamed from 'list' to avoid shadowing the built-in\n", "    articles = soup.find_all('li', class_ = 'article-list-item')\n", "    for article in articles:\n", "        article_text = article.get_text().replace('\\n', '')\n", "        for item in key_words:\n", "            if item in article_text:\n", "                final_list.append([article_text, 'https://www.binance.com' + article.find('a').get('href')])\n", "                break  # stop at the first matching key word so one article is not appended twice\n", "    return final_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the first pass" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 30.9 ms, sys: 349 µs, total: 31.2 ms\n", "Wall time: 77.1 ms\n" ] } ], "source": [ "%%time\n", "main_webpage = 'https://www.binance.com/en/support/categories/115000056351'\n", "old_urls = extract_binance(main_webpage, key_words)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Loop pass and get the new announcements" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done for now. 
Time to go to sleep mate!\n" ] } ], "source": [ "# Loop pass - Watchdog mode\n", "# Re-scrape every 15 min and tweet only announcements not seen before.\n", "while True:\n", "    new_urls = extract_binance(main_webpage, key_words)\n", "    for item in new_urls:\n", "        if item not in old_urls:\n", "            msg = item[0] + '\\n' + item[1]\n", "            api.update_status(msg)\n", "            # Remember the item so it is tweeted only once, not on every pass\n", "            old_urls.append(item)\n", "    print('Done for now. Time to go to sleep mate!')\n", "    time.sleep(900) # Check every 15 min" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }