# %% [markdown]
# # Data Scraping for Binance Announcements
# Beta version. Modified on 07-09-2019

# %% [markdown]
# Import all the needed packages:

# %%
import os
import urllib.request
import warnings

import bs4 as bs
import tweepy

# %% [markdown]
# Twitter app: read the API credentials from the environment and build the client.

# %%
# Credentials are read from environment variables so that no secret is stored
# in the notebook itself.
consumer_key = os.environ.get('TW_CONSUMER_KEY')
consumer_secret = os.environ.get('TW_CONSUMER_SECRET')
access_token = os.environ.get('TW_ACCESS_TOKEN')
access_token_secret = os.environ.get('TW_ACCESS_TOKEN_SECRET')

# os.environ.get() returns None for an unset variable. Warn here so a missing
# credential is noticed immediately, but keep going: the scraping cells do not
# need Twitter access.
_missing = [
    name
    for name, value in (
        ('TW_CONSUMER_KEY', consumer_key),
        ('TW_CONSUMER_SECRET', consumer_secret),
        ('TW_ACCESS_TOKEN', access_token),
        ('TW_ACCESS_TOKEN_SECRET', access_token_secret),
    )
    if not value
]
if _missing:
    warnings.warn('Twitter credentials not set: ' + ', '.join(_missing))

# authentication of consumer key and secret
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# authentication of access token and secret
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# %% [markdown]
# Create two empty lists for storing news URLs. A message should be sent when
# there is a new item in new_urls that was not in old_urls.

# %%
# Named new_urls (not news_urls) so the placeholder matches the name the later
# cells actually assign and read.
old_urls, new_urls = [], []

# %% [markdown]
# Create a bag of keywords for matching announcement titles.
# Don't add plural forms: matching is by substring, so 'List' already covers
# 'Lists'.

# %%
key_words = ['List', 'list', 'Token Sale', 'Open Trading', 'open trading']

# %% [markdown]
# Create the function to extract the information from the webpage and get the matches

# %%
def extract_binance(main_webpage, key_words, timeout=30):
    """Return the announcements on a page whose text contains any keyword.

    Parameters
    ----------
    main_webpage : str
        URL of the page to download.
    key_words : iterable of str
        Case-sensitive substrings; an article is kept when at least one of
        them occurs in its text.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    list of [str, str]
        One ``[article_text, article_url]`` pair per matching
        ``<li class="article-list-item">`` element, in page order. Each
        article appears at most once, however many keywords it matches.

    Notes
    -----
    Errors raised by ``urllib.request.urlopen`` (for example on a network
    failure or timeout) are not caught here.
    """
    # Local import so this cell does not depend on an edit to the imports cell.
    from urllib.parse import urljoin

    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(main_webpage, timeout=timeout) as response:
        sauce = response.read()
    soup = bs.BeautifulSoup(sauce, 'lxml')

    final_list = []
    for article in soup.find_all('li', class_='article-list-item'):
        article_text = article.get_text().replace('\n', '')
        # any() stops at the first hit, so an article matching several
        # keywords is still added only once.
        if not any(item in article_text for item in key_words):
            continue
        link = article.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            # No usable link: there is no URL to report for this entry.
            continue
        # urljoin resolves site-relative hrefs and leaves absolute ones intact.
        final_list.append([article_text, urljoin('https://www.binance.com', href)])
    return final_list

# %% [markdown]
# Get the first pass

# %%
# (In the notebook this cell is run under the %%time cell magic to report how
# long the request takes.)
main_webpage = 'https://www.binance.com/en/support/categories/115000056351'
old_urls = extract_binance(main_webpage, key_words)
# %%
# Second pass over the same page. The extra hand-written entry is presumably a
# placeholder so that the comparison below has something new to report.
new_urls = extract_binance(main_webpage, key_words)
new_urls.append(['Pepito', 'Fulanito'])

# %%
# Print the title (first element) of every entry in new_urls that is not
# present in old_urls.
unseen_titles = [entry[0] for entry in new_urls if entry not in old_urls]
for msg in unseen_titles:
    print(msg)
    # Tweeting is still disabled: api.update_status('Testing')