In [1]:
%matplotlib inline
import numpy as np
import re
import pandas as pd
from bs4 import BeautifulSoup
from sys import argv
from urllib.request import urlopen
from urllib.error import HTTPError
import requests
import itertools
import datetime as dt

Define Functions and Settings

In [2]:
def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into consecutive tuples of length ``n``.

    A final group that comes up short is padded with ``fillvalue``.
    Example: grouper('ABCDEFG', 3, 'x') -> [('A','B','C'), ('D','E','F'), ('G','x','x')]

    Returns a list of tuples.
    """
    source = iter(iterable)
    # The *same* iterator repeated n times: zip_longest draws from it in turn,
    # so every output tuple consumes n consecutive items.
    slots = (source,) * n
    return [*itertools.zip_longest(*slots, fillvalue=fillvalue)]
In [3]:
def get_item(soup):
    """Return the stripped text of the table whose ``summary`` attribute matches.

    ``soup`` must provide ``find(name, attrs=...)``; the matched element's
    ``.text`` is returned with surrounding whitespace removed.
    """
    summary_text = "외국인 기관 순매매 거래량에 관한표이며 날짜별로 정보를 제공합니다."
    table = soup.find('table', attrs={'summary': summary_text})
    return table.text.strip()
In [4]:
def get_num(soup):
    """Return the stripped text of the first table with class ``type2``.

    ``soup`` must provide ``find(name, attrs=...)``; the matched element's
    ``.text`` is returned with surrounding whitespace removed.
    """
    table = soup.find('table', attrs={'class': "type2"})
    return table.text.strip()
In [5]:
# Labels assigned positionally to each 9-value row of the
# foreign/institutional trading table.
column = [
    '날짜',             # date
    '종가',             # closing price
    '전일비',           # change vs. previous day
    '등락률',           # percent change
    '거래량',           # trading volume
    '기관 순매매량',    # institutional net trading volume
    '외국인 순매매량',  # foreign net trading volume
    '외국인 보유주수',  # shares held by foreigners
    '외국인 보유율',    # foreign ownership ratio
]
In [6]:
# Labels assigned positionally to each 7-value row of the daily price table.
column2 = [
    '날짜',    # date
    '종가',    # closing price
    '전일비',  # change vs. previous day
    '시가',    # opening price
    '고가',    # daily high
    '저가',    # daily low
    '거래량',  # trading volume
]

Stock code and page range to scrape

In [7]:
# Stock code; sent as the `code` query parameter in both request URLs and
# reused as the stem of the output pickle's file name.
code = "035720"
In [8]:
# Page bound for the scrapers: the foreign/institutional loop requests pages
# 1 .. length-1 and the daily-price loop requests pages 1 .. length*2-2.
# Original author's note: one page covers roughly 20~30 days.
length = 20

Parsing

Foreign and institutional volume

In [9]:
# Accumulates one tuple per scraped row of the foreign/institutional table.
stock = []
In [10]:
# Scrape the foreign/institutional trading table one page at a time and
# append every parsed row to `stock`.
for num in range(1, length):  # pages 1 .. length-1
    page = requests.get(
        "https://finance.naver.com/item/frgn.nhn?code={0}&page={1}".format(code, num),
        timeout=10,  # fail fast instead of hanging forever on a stalled connection
    )
    # Surface HTTP errors here rather than as an opaque parse failure below.
    page.raise_for_status()
    stock_surround = BeautifulSoup(page.content, "html.parser")
    # Keep only the text after the last tab on each line of the table text.
    cells = [line.split('\t')[-1] for line in get_item(stock_surround).splitlines()]
    # Drop blank entries, then skip the first 12 remaining ones
    # (presumably the table's header labels -- TODO confirm against the page).
    stockdata = [cell for cell in cells if cell.strip()][12:]
    # One tuple per table row, one value per label in `column` (9);
    # grouper pads a short final row with None.
    stock.extend(grouper(stockdata, len(column)))
In [11]:
# Build the foreign/institutional frame: one row per scraped tuple,
# labelled with the names in `column`.
stockDF = pd.DataFrame(stock, columns=column)

Stock Data

In [12]:
# Accumulates one tuple per scraped row of the daily price table.
stock2 = []
In [13]:
# Scrape the daily price table one page at a time and append every parsed
# row to `stock2`.
for num in range(1, length * 2 - 1):  # pages 1 .. length*2-2
    page2 = requests.get(
        "https://finance.naver.com/item/sise_day.nhn?code={0}&page={1}".format(code, num),
        timeout=10,  # fail fast instead of hanging forever on a stalled connection
    )
    # Surface HTTP errors here rather than as an opaque parse failure below.
    page2.raise_for_status()
    stock_surround2 = BeautifulSoup(page2.content, "html.parser")
    # Keep only the text after the last tab on each line of the table text.
    cells2 = [line.split('\t')[-1] for line in get_num(stock_surround2).splitlines()]
    # Drop blank entries, then skip the first 7 remaining ones
    # (presumably the table's header labels -- TODO confirm against the page).
    stockdata2 = [cell for cell in cells2 if cell.strip()][7:]
    # One tuple per table row, one value per label in `column2` (7);
    # grouper pads a short final row with None.
    stock2.extend(grouper(stockdata2, len(column2)))
In [14]:
# Build the daily price frame: one row per scraped tuple,
# labelled with the names in `column2`.
stockDF2 = pd.DataFrame(stock2, columns=column2)

data merge

In [15]:
# Remove the day-over-day change column from the daily price frame.
stockDF2 = stockDF2.drop(columns=["전일비"])
In [16]:
# Remove columns from the foreign/institutional frame before merging.
# 종가 and 거래량 are also present in the daily price frame, so dropping them
# here keeps a single copy of each after the merge.
stockDF = stockDF.drop(columns=["종가", "거래량", "전일비", "외국인 보유주수"])
In [17]:
# Join the two frames on the date column, keeping only dates found in both.
totalstock = stockDF.merge(stockDF2, on='날짜', how='inner')

data wrangling

In [18]:
# 종가 (closing price): remove "," characters, then cast the text to float.
totalstock["종가"] = (
    totalstock["종가"]
    .str.replace(",", "")
    .astype(float)
)
In [19]:
# 시가 (opening price): remove "," characters, then cast the text to float.
totalstock["시가"] = (
    totalstock["시가"]
    .str.replace(",", "")
    .astype(float)
)
In [20]:
# 고가 (daily high): remove "," characters, then cast the text to float.
totalstock["고가"] = (
    totalstock["고가"]
    .str.replace(",", "")
    .astype(float)
)
In [21]:
# 저가 (daily low): remove "," characters, then cast the text to float.
totalstock["저가"] = (
    totalstock["저가"]
    .str.replace(",", "")
    .astype(float)
)
In [22]:
# 거래량 (trading volume): remove "," characters, then cast the text to float.
totalstock["거래량"] = (
    totalstock["거래량"]
    .str.replace(",", "")
    .astype(float)
)
In [23]:
# 외국인 순매매량 (foreign net trading volume): remove "," characters,
# then cast the text to float.
totalstock["외국인 순매매량"] = (
    totalstock["외국인 순매매량"]
    .str.replace(",", "")
    .astype(float)
)
In [24]:
# 기관 순매매량 (institutional net trading volume): remove "," characters,
# then cast the text to float.
totalstock["기관 순매매량"] = (
    totalstock["기관 순매매량"]
    .str.replace(",", "")
    .astype(float)
)
In [25]:
# 외국인 보유율 (foreign ownership ratio): drop the final character
# (presumably a trailing "%" -- confirm against the scraped text), then cast to float.
totalstock["외국인 보유율"] = (
    totalstock["외국인 보유율"]
    .str.slice(stop=-1)
    .astype(float)
)
In [26]:
# 등락률 (percent change): drop the final character
# (presumably a trailing "%" -- confirm against the scraped text), then cast to float.
totalstock["등락률"] = (
    totalstock["등락률"]
    .str.slice(stop=-1)
    .astype(float)
)
In [27]:
# Parse the merged frame's OWN date column. The previous version parsed
# stockDF's column instead; column assignment aligns on index labels, so any
# row dropped or reordered by the inner merge would attach dates to the wrong
# rows (or leave NaT).
totalstock["날짜"] = pd.to_datetime(totalstock["날짜"])
# 기간 (elapsed period): whole days between each row's date and the date of
# the last row in the frame.
last_row_date = totalstock["날짜"].iloc[-1]
totalstock["기간"] = (totalstock["날짜"] - last_row_date).dt.days
In [28]:
# Display the merged and type-converted frame as this cell's output.
totalstock
Out[28]:
날짜 등락률 기관 순매매량 외국인 순매매량 외국인 보유율 종가 시가 고가 저가 거래량 기간
0 2018-08-17 -0.39 65833.0 -7821.0 24.84 127000.0 128500.0 129000.0 126500.0 325943.0 568
1 2018-08-16 0.00 107192.0 -28291.0 24.85 127500.0 124500.0 128000.0 124000.0 478674.0 567
2 2018-08-14 2.82 117535.0 -16977.0 24.91 127500.0 125000.0 128500.0 123000.0 518080.0 565
3 2018-08-13 -3.12 25179.0 -4321.0 24.93 124000.0 126500.0 127500.0 122500.0 636617.0 564
4 2018-08-10 1.99 419755.0 36761.0 24.99 128000.0 125500.0 129500.0 124000.0 1269641.0 561
5 2018-08-09 5.46 325660.0 199748.0 24.93 125500.0 121000.0 126000.0 119500.0 1622449.0 560
6 2018-08-08 -0.83 105446.0 -15822.0 24.65 119000.0 122000.0 124500.0 118000.0 1216684.0 559
7 2018-08-07 5.73 57785.0 -59047.0 24.69 120000.0 114000.0 120000.0 113000.0 742353.0 558
8 2018-08-06 0.44 17136.0 15476.0 24.75 113500.0 113500.0 114500.0 112500.0 189971.0 557
9 2018-08-03 1.80 -6462.0 26766.0 24.73 113000.0 111500.0 113000.0 111500.0 169521.0 554
10 2018-08-02 -1.33 -27250.0 -17886.0 24.79 111000.0 112500.0 115000.0 111000.0 311779.0 553
11 2018-08-01 0.00 -42943.0 4084.0 24.80 112500.0 112500.0 113500.0 111500.0 276459.0 552
12 2018-07-31 -3.43 -70958.0 -11308.0 24.81 112500.0 115000.0 116000.0 111500.0 654749.0 551
13 2018-07-30 -2.51 -13734.0 -31222.0 24.83 116500.0 118500.0 119000.0 116000.0 308410.0 550
14 2018-07-27 1.70 28854.0 58129.0 24.93 119500.0 117500.0 120000.0 116500.0 568921.0 547
15 2018-07-26 -0.42 37172.0 2000.0 24.95 117500.0 118000.0 118500.0 116000.0 262125.0 546
16 2018-07-25 0.85 40713.0 39328.0 24.93 118000.0 117500.0 118000.0 116000.0 310394.0 545
17 2018-07-24 1.30 41269.0 67939.0 24.88 117000.0 115500.0 118000.0 115000.0 502533.0 544
18 2018-07-23 0.43 34065.0 43679.0 24.80 115500.0 115000.0 117000.0 113500.0 346894.0 543
19 2018-07-20 0.88 36800.0 6457.0 24.78 115000.0 114000.0 115500.0 113000.0 184967.0 540
20 2018-07-19 0.88 46781.0 29197.0 24.75 114000.0 113000.0 115500.0 112500.0 267375.0 539
21 2018-07-18 -0.88 -15125.0 13660.0 24.71 113000.0 115000.0 116500.0 112000.0 351263.0 538
22 2018-07-17 -2.15 -14384.0 15561.0 24.69 114000.0 116000.0 116500.0 113500.0 372754.0 537
23 2018-07-16 -1.27 -33186.0 22353.0 24.70 116500.0 118000.0 118500.0 115500.0 236711.0 536
24 2018-07-13 0.43 -6532.0 32115.0 24.60 118000.0 118000.0 119000.0 116500.0 302155.0 533
25 2018-07-12 0.86 66627.0 -25038.0 24.49 117500.0 117000.0 118000.0 115500.0 444272.0 532
26 2018-07-11 -0.43 4623.0 -21174.0 24.51 116500.0 116000.0 117000.0 115000.0 250313.0 531
27 2018-07-10 -0.43 38397.0 -26867.0 24.53 117000.0 117500.0 118000.0 116000.0 222321.0 530
28 2018-07-09 2.62 30234.0 89783.0 24.56 117500.0 114500.0 117500.0 114500.0 394098.0 529
29 2018-07-06 -1.72 34988.0 -62084.0 24.39 114500.0 116500.0 117000.0 113000.0 383082.0 526
... ... ... ... ... ... ... ... ... ... ... ...
350 2017-03-13 -0.59 -59567.0 -48034.0 22.50 84800.0 85400.0 85500.0 84000.0 341028.0 46
351 2017-03-10 3.02 34114.0 -103037.0 22.48 85300.0 83500.0 85800.0 81600.0 501305.0 43
352 2017-03-09 -0.24 -49144.0 -10379.0 22.63 82800.0 83500.0 84900.0 82800.0 350630.0 42
353 2017-03-08 -0.84 -37614.0 -38619.0 22.67 83000.0 84200.0 84300.0 82700.0 286714.0 41
354 2017-03-07 -0.12 -5745.0 -9126.0 22.72 83700.0 83600.0 84300.0 82600.0 277748.0 40
355 2017-03-06 -2.33 -15682.0 -39810.0 22.74 83800.0 85000.0 85700.0 83600.0 382382.0 39
356 2017-03-03 0.59 -572.0 -45660.0 22.80 85800.0 85000.0 86500.0 84400.0 396672.0 36
357 2017-03-02 -0.12 -28239.0 -40564.0 22.92 85300.0 86000.0 86100.0 84500.0 354383.0 35
358 2017-02-28 -0.47 -40453.0 -76359.0 22.90 85400.0 86500.0 86800.0 84900.0 444304.0 33
359 2017-02-27 -2.61 -27545.0 -34101.0 22.94 85800.0 87500.0 87900.0 85700.0 330736.0 32
360 2017-02-24 0.23 -45671.0 54418.0 22.84 88100.0 88500.0 89600.0 87600.0 369669.0 29
361 2017-02-23 -1.79 -24460.0 -5161.0 22.75 87900.0 89100.0 89500.0 87600.0 507957.0 28
362 2017-02-22 1.24 19282.0 273743.0 23.32 89500.0 88600.0 90200.0 88200.0 1034489.0 27
363 2017-02-21 4.25 116728.0 317758.0 23.27 88400.0 85700.0 89100.0 85700.0 1573781.0 26
364 2017-02-20 0.12 -33525.0 -17301.0 22.80 84800.0 84400.0 85300.0 83600.0 294768.0 25
365 2017-02-17 2.79 16158.0 95283.0 22.86 84700.0 82400.0 85300.0 82400.0 563691.0 22
366 2017-02-16 -2.94 -56432.0 -113276.0 22.90 82400.0 84600.0 84800.0 82200.0 569723.0 21
367 2017-02-15 0.00 -47046.0 51988.0 23.10 84900.0 85100.0 85600.0 84100.0 321230.0 20
368 2017-02-14 0.95 -5440.0 35941.0 23.05 84900.0 84000.0 85300.0 83000.0 427851.0 19
369 2017-02-13 -1.18 -19624.0 -9197.0 23.08 84100.0 85400.0 85500.0 83700.0 397630.0 18
370 2017-02-10 -0.35 25600.0 102474.0 23.04 85100.0 85300.0 86300.0 84100.0 792741.0 15
371 2017-02-09 7.02 135269.0 224265.0 22.80 85400.0 81700.0 85700.0 81300.0 1550608.0 14
372 2017-02-08 1.92 -8779.0 -5673.0 22.34 79800.0 78800.0 79800.0 77800.0 269292.0 13
373 2017-02-07 1.29 16895.0 -13154.0 22.30 78300.0 77000.0 79100.0 77000.0 173244.0 12
374 2017-02-06 -0.90 -7804.0 -17389.0 22.27 77300.0 78100.0 78500.0 77000.0 169147.0 11
375 2017-02-03 0.39 -10041.0 -35163.0 22.30 78000.0 78000.0 78500.0 77500.0 178240.0 8
376 2017-02-02 -0.89 -2380.0 -39118.0 22.33 77700.0 79100.0 80000.0 77600.0 285670.0 7
377 2017-02-01 0.90 11066.0 -40249.0 22.34 78400.0 78200.0 78900.0 78000.0 197510.0 6
378 2017-01-31 0.78 12957.0 -14740.0 22.47 77700.0 77200.0 79300.0 76900.0 260779.0 5
379 2017-01-26 1.31 -75560.0 51328.0 22.45 77100.0 76200.0 77700.0 76200.0 316390.0 0

380 rows × 11 columns

Save as a pickle

In [29]:
# Write the frame to "<code>.bz2" in the working directory. With the default
# compression setting pandas picks the codec from the file extension.
totalstock.to_pickle(f"{code}.bz2")