aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordarroyolpz <darroyolpz@users.noreply.github.com>2019-09-07 17:59:00 +0200
committerGitHub <noreply@github.com>2019-09-07 17:59:00 +0200
commit37ce1116c68a136171f8eb0599f2baecf38227ed (patch)
tree715fa574eb855b61c4846431797cb1faf7decde0
parent1bab5054c54c706469fe414b29074f20bbdabacb (diff)
First beta version
-rw-r--r--Data Scraping for Binance Announcements.ipynb214
1 files changed, 214 insertions, 0 deletions
diff --git a/Data Scraping for Binance Announcements.ipynb b/Data Scraping for Binance Announcements.ipynb
new file mode 100644
index 0000000..d45aaa1
--- /dev/null
+++ b/Data Scraping for Binance Announcements.ipynb
@@ -0,0 +1,214 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Data Scraping for Binance Announcements\n",
+ "Beta version. Modified on 07-09-2019"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Import all the needed packages:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import bs4 as bs\n",
+ "import urllib.request\n",
+ "import tweepy, os"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Twitter app: authenticate with credentials read from environment variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Twitter credentials come from the environment; a variable that is not set yields None\n",
+ "consumer_key, consumer_secret, access_token, access_token_secret = (\n",
+ "    os.environ.get(name)\n",
+ "    for name in (\n",
+ "        'TW_CONSUMER_KEY',\n",
+ "        'TW_CONSUMER_SECRET',\n",
+ "        'TW_ACCESS_TOKEN',\n",
+ "        'TW_ACCESS_TOKEN_SECRET',\n",
+ "    )\n",
+ ")\n",
+ "\n",
+ "# Register the consumer key/secret first, then attach the access token pair\n",
+ "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n",
+ "auth.set_access_token(access_token, access_token_secret)\n",
+ "\n",
+ "# API client built on the handler configured above\n",
+ "api = tweepy.API(auth)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create two empty lists for storing announcement entries: one for the previous scrape and one for the latest scrape. A message should be sent whenever the latest scrape contains an item that was not in `old_urls`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "old_urls, new_urls = [], []"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
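+ "Create a bag of key words for getting matches. Don't add plurals: the singular already matches them as a substring (e.g. 'List' is found in 'Lists'), so plurals are redundant and can lead to duplicate matches."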
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Substrings searched for in each announcement text; the lookup is case-sensitive,\n",
+ "# hence the capitalised and lower-case variants\n",
+ "key_words = [\n",
+ "    'List', 'list',\n",
+ "    'Token Sale',\n",
+ "    'Open Trading', 'open trading',\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create the function that extracts the announcements from the webpage and returns the ones matching a key word"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def extract_binance(main_webpage, key_words):\n",
+ "    \"\"\"Scrape an announcements page and return the articles matching a key word.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    main_webpage : str\n",
+ "        URL of the page that lists the announcements.\n",
+ "    key_words : list of str\n",
+ "        Substrings looked for in each article text (case-sensitive).\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    list of [str, str]\n",
+ "        One [text, url] pair per matching article, in page order. An article\n",
+ "        appears once even when several key words match it.\n",
+ "    \"\"\"\n",
+ "    # Context manager closes the HTTP response once the body has been read\n",
+ "    with urllib.request.urlopen(main_webpage) as response:\n",
+ "        sauce = response.read()\n",
+ "    soup = bs.BeautifulSoup(sauce, 'lxml')\n",
+ "    # Named 'articles' rather than 'list' so the built-in is not shadowed\n",
+ "    articles = soup.find_all('li', class_='article-list-item')\n",
+ "    final_list = []\n",
+ "    for article in articles:\n",
+ "        article_text = article.get_text().replace('\\n', '')\n",
+ "        # any() stops at the first hit: one entry per article, not one per key word\n",
+ "        if not any(item in article_text for item in key_words):\n",
+ "            continue\n",
+ "        link = article.find('a')\n",
+ "        href = link.get('href') if link is not None else None\n",
+ "        if not href:\n",
+ "            continue  # nothing to link to, so nothing to announce\n",
+ "        final_list.append([article_text, 'https://www.binance.com' + href])\n",
+ "    return final_list"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Run the first pass to get the baseline list of matching announcements"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 38.2 ms, sys: 3.74 ms, total: 41.9 ms\n",
+ "Wall time: 147 ms\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "# Baseline scrape: entries found later are compared against old_urls\n",
+ "main_webpage = 'https://www.binance.com/en/support/categories/115000056351'\n",
+ "old_urls = extract_binance(main_webpage=main_webpage, key_words=key_words)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[['Binance Lists Chiliz (CHZ)',\n",
+ " 'https://www.binance.com/en/support/articles/360033377831'],\n",
+ " ['Binance Completes Perlin Lottery Draw and Will Open Trading For PERL',\n",
+ " 'https://www.binance.com/en/support/articles/360032900851'],\n",
+ " ['Binance Lists Second BEP2 Community Listing Project - TomoChain (TOMO)',\n",
+ " 'https://www.binance.com/en/support/articles/360032514812'],\n",
+ " ['Introducing the Band Protocol (BAND) Token Sale on Binance Launchpad',\n",
+ " 'https://www.binance.com/en/support/articles/360033102832'],\n",
+ " ['Pepito', 'Fulanito']]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Dummy entry added to the fresh scrape, presumably to exercise the comparison with old_urls\n",
+ "a = ['Pepito', 'Fulanito']\n",
+ "new_urls = extract_binance(main_webpage, key_words) + [a]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Pepito\n"
+ ]
+ }
+ ],
+ "source": [
+ "for item in new_urls:\n",
+ "    if item in old_urls:\n",
+ "        continue  # already present in the earlier scrape\n",
+ "    msg = item[0]  # first element of each entry is the announcement text\n",
+ "    print(msg)\n",
+ "    #api.update_status('Testing')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}