diff options
author | darroyolpz <darroyolpz@users.noreply.github.com> | 2019-09-07 17:59:00 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-09-07 17:59:00 +0200 |
commit | 37ce1116c68a136171f8eb0599f2baecf38227ed (patch) | |
tree | 715fa574eb855b61c4846431797cb1faf7decde0 | |
parent | 1bab5054c54c706469fe414b29074f20bbdabacb (diff) |
First beta version
-rw-r--r-- | Data Scraping for Binance Announcements.ipynb | 214 |
1 files changed, 214 insertions, 0 deletions
diff --git a/Data Scraping for Binance Announcements.ipynb b/Data Scraping for Binance Announcements.ipynb new file mode 100644 index 0000000..d45aaa1 --- /dev/null +++ b/Data Scraping for Binance Announcements.ipynb @@ -0,0 +1,214 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Scraping for Binance Announcements\n", + "Beta version. Modified on 07-09-2019" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import all the needed packages:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "import bs4 as bs\n", + "import urllib.request\n", + "import tweepy, os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Twitter app (credentials are read from the `TW_*` environment variables)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "consumer_key = os.environ.get('TW_CONSUMER_KEY')\n", + "consumer_secret = os.environ.get('TW_CONSUMER_SECRET')\n", + "access_token = os.environ.get('TW_ACCESS_TOKEN')\n", + "access_token_secret = os.environ.get('TW_ACCESS_TOKEN_SECRET')\n", + "# Build the auth handler from the app's consumer key and secret\n", + "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", + "# Attach the account's access token and secret to the handler\n", + "auth.set_access_token(access_token, access_token_secret)\n", + "api = tweepy.API(auth)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create two empty lists for storing news urls. A message should be sent when there is an item in `new_urls` that wasn't in `old_urls`" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "old_urls, new_urls = [], []" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a bag of key words for getting matches. 
Don't use plurals: matching is by substring, so 'List' already covers 'Lists'" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "key_words = ['List', 'list', 'Token Sale', 'Open Trading', 'open trading']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the function to extract the information from the webpage and collect the matching announcements" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_binance(main_webpage, key_words):\n", + "    \"\"\"Return [title, url] pairs for the listed articles whose text contains any key word.\"\"\"\n", + "    # The with-block closes the HTTP response once the page has been parsed\n", + "    with urllib.request.urlopen(main_webpage) as response:\n", + "        soup = bs.BeautifulSoup(response.read(), 'lxml')\n", + "    final_list = []\n", + "    for article in soup.find_all('li', class_='article-list-item'):\n", + "        article_text = article.get_text().replace('\\n', '')\n", + "        link = article.find('a')\n", + "        href = link.get('href') if link is not None else None\n", + "        # Add each article once, even when several key words match; entries without a link are skipped\n", + "        if href and any(item in article_text for item in key_words):\n", + "            final_list.append([article_text, 'https://www.binance.com' + href])\n", + "    return final_list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the first pass" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 38.2 ms, sys: 3.74 ms, total: 41.9 ms\n", + "Wall time: 147 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "main_webpage = 'https://www.binance.com/en/support/categories/115000056351'\n", + "old_urls = extract_binance(main_webpage, key_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['Binance Lists Chiliz (CHZ)',\n", + " 'https://www.binance.com/en/support/articles/360033377831'],\n", + " ['Binance Completes Perlin Lottery Draw and 
Will Open Trading For PERL',\n", + " 'https://www.binance.com/en/support/articles/360032900851'],\n", + " ['Binance Lists Second BEP2 Community Listing Project - TomoChain (TOMO)',\n", + " 'https://www.binance.com/en/support/articles/360032514812'],\n", + " ['Introducing the Band Protocol (BAND) Token Sale on Binance Launchpad',\n", + " 'https://www.binance.com/en/support/articles/360033102832'],\n", + " ['Pepito', 'Fulanito']]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_urls = extract_binance(main_webpage, key_words)\n", + "a = ['Pepito', 'Fulanito']  # made-up entry, presumably to exercise the new-item check below\n", + "new_urls.append(a)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pepito\n" + ] + } + ], + "source": [ + "for item in new_urls:\n", + "    if item not in old_urls:\n", + "        msg = item[0]  # first element of each [title, url] pair\n", + "        print(msg)\n", + "        #api.update_status('Testing')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} |