Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Now CSV files start in 2021. Previous years are zipped. #13

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 78 additions & 42 deletions fundspy/fundspy.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,57 +37,88 @@ def cvm_informes (year: int, mth: int) -> pd.DataFrame:
mth (int): The month of the report the function should download\n

<b>Returns:</b>\n
pd.DataFrame: Pandas dataframe with the report for the given month and year. If the year is previous to 2017, will contain data regarding the whole year
pd.DataFrame: Pandas dataframe with the report for the given month and year. If the year is previous to 2021, will contain data regarding the whole year

"""

if int(year) >= 2017: #uses download process from reports after the year of 2017
try:
csv_path=f"C:/Users/asbra/Documents/Financas/BD/Cotacoes/inf_diario_fi_{year}{mth:02d}.csv"
print(f'Looking for csv in {csv_path}')
cotas = pd.read_csv(csv_path, sep =';')
cotas['DT_COMPTC'] = pd.to_datetime(cotas['DT_COMPTC']) #casts date column to datetime
try:
mth = f"{mth:02d}"
year = str(year)
#creates url using the parameters provided to the function
url = 'http://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/inf_diario_fi_'+year+mth+'.csv'
#removes column present in only a few reports to avoid inconsistency when making the union of reports
cotas.drop(columns = ['TP_FUNDO'], inplace = True)
except KeyError:
pass
return cotas
except:
print(f'{year}-{mth}: theres no report for this date yet!.\n')

# if int(year) >= 2021: #uses download process from reports after the year of 2021
# try:
# mth = f"{mth:02d}"
# year = str(year)
# #creates url using the parameters provided to the function
# url = 'http://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/inf_diario_fi_'+year+mth+'.csv'

#reads the csv returned by the link
cotas = pd.read_csv(url, sep =';')
cotas['DT_COMPTC'] = pd.to_datetime(cotas['DT_COMPTC']) #casts date column to datetime
# #reads the csv returned by the link
# cotas = pd.read_csv(url, sep =';')
# cotas['DT_COMPTC'] = pd.to_datetime(cotas['DT_COMPTC']) #casts date column to datetime

try:
#removes column present in only a few reports to avoid inconsistency when making the union of reports
cotas.drop(columns = ['TP_FUNDO'], inplace = True)
except KeyError:
pass
# try:
# #removes column present in only a few reports to avoid inconsistency when making the union of reports
# cotas.drop(columns = ['TP_FUNDO'], inplace = True)
# except KeyError:
# pass

return cotas
except HTTPError:
print('theres no report for this date yet!.\n')
# return cotas
# except:
# try:
# csv_path=f"C:/Users/asbra/Documents/Financas/BD/Cotacoes/inf_diario_fi_{year}{mth}.csv"
# print(f'Looking for csv in {csv_path}')
# cotas = pd.read_csv(csv_path, sep =';')
# cotas['DT_COMPTC'] = pd.to_datetime(cotas['DT_COMPTC']) #casts date column to datetime
# try:
# #removes column present in only a few reports to avoid inconsistency when making the union of reports
# cotas.drop(columns = ['TP_FUNDO'], inplace = True)
# except KeyError:
# pass
# return cotas
# except:
# print(f'{year}-{mth}: theres no report for this date yet!.\n')

if int(year) < 2017:
try:
year = str(year)
# if int(year) < 2021:
# try:
# year = str(year)

url = 'http://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/HIST/inf_diario_fi_' + year + '.zip'
#sends request to the url
r = requests.get(url, stream=True, allow_redirects=True)
# url = 'http://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/HIST/inf_diario_fi_' + year + '.zip'
# #sends request to the url
# r = requests.get(url, stream=True, allow_redirects=True)

with open('informe' + year + '.zip', 'wb') as fd: #writes the .zip file downloaded
fd.write(r.content)
# with open('informe' + year + '.zip', 'wb') as fd: #writes the .zip file downloaded
# fd.write(r.content)

zip_inf = zipfile.ZipFile('informe' + year + '.zip') #opens the .zip file
# zip_inf = zipfile.ZipFile('informe' + year + '.zip') #opens the .zip file

#reads the csv files inside the zip file
informes = [pd.read_csv(zip_inf.open(f), sep=";") for f in zip_inf.namelist()]
cotas = pd.concat(informes,ignore_index=True)
# #le os arquivos csv dentro do arquivo zip
# informes = [pd.read_csv(zip_inf.open(f), sep=";") for f in zip_inf.namelist()]
# cotas = pd.concat(informes,ignore_index=True)

cotas['DT_COMPTC'] = pd.to_datetime(cotas['DT_COMPTC']) #casts date column to datetime
# cotas['DT_COMPTC'] = pd.to_datetime(cotas['DT_COMPTC']) #casts date column to datetime

zip_inf.close() #closes the zip file
os.remove('informe' + year + '.zip') #deletes .zip file
# try:
# #removes column present in only a few reports to avoid inconsistency when making the union of reports
# cotas.drop(columns = ['TP_FUNDO'], inplace = True)
# except KeyError:
# pass

return cotas
# zip_inf.close() #closes the zip file
# os.remove('informe' + year + '.zip') #deletes .zip file

# return cotas

except Exception as E:
print(E)
# except Exception as E:
# print(E)


def start_db(db_dir: str = 'investments_database.db', start_year: int = 2005, target_funds: list = []):
Expand All @@ -112,11 +143,11 @@ def start_db(db_dir: str = 'investments_database.db', start_year: int = 2005, ta
#downloads each report in the cvm website and pushes it to the sql database daily_quotas table
print('downloading daily reports from the CVM website... \n')

#for each year between 2017 and now
#for each year between 2021 and now
for year in tqdm(range(start_year, datetime.date.today().year + 1), position = 0, leave=True):
for mth in range(1, 13): #for each month
#loop structure for years equal or after 2017
if year>=2017:
#loop structure for years equal or after 2021
if year>=2021:
informe = cvm_informes(str(year), mth)

try:
Expand All @@ -127,7 +158,7 @@ def start_db(db_dir: str = 'investments_database.db', start_year: int = 2005, ta
except AttributeError:
pass

elif year<2017: #loop structure to handle years before 2017 (they have a different file structure)
elif year<2021: #loop structure to handle years before 2021 (they have a different file structure)
#only executes the download function once every year to avoid duplicates (unique file for each year)
if mth == 12:
informe = cvm_informes(str(year), mth)
Expand Down Expand Up @@ -244,6 +275,9 @@ def update_db(db_dir: str = r'investments_database.db'):
last_quota = Cal.sub_working_days(last_update, 2) #date of the last published cvm report
num_months = (today.year - last_quota.year) * 12 + (today.month - last_quota.month) + 1

print(f'Today : {today}')
print(f'Last update: {last_update} -> last update from the log table')
print(f'Last quota : {last_quota} -> date of the last published cvm repport')

##STEP 3
#delete information that will be updated from the database tables
Expand All @@ -270,7 +304,7 @@ def update_db(db_dir: str = r'investments_database.db'):
except DatabaseError:
target_funds = []

print('downloading new daily reports from the CVM website...\n')
print('reading files with daily reports from CVM...\n')
# downloads the daily cvm repport for each month between the last update and today
for m in range(num_months+1):
data_alvo = last_quota + relativedelta(months=+m)
Expand All @@ -284,7 +318,9 @@ def update_db(db_dir: str = r'investments_database.db'):

#downloads cadastral information from CVM of the fundos and pushes it to the database
print('downloading updated cadastral information from cvm...\n')
info_cad = pd.read_csv('http://dados.cvm.gov.br/dados/FI/CAD/DADOS/cad_fi.csv', sep = ';', encoding='latin1',
# cad_fi_csv = 'http://dados.cvm.gov.br/dados/FI/CAD/DADOS/cad_fi.csv'
cad_fi_csv = r"C:\Users\asbra\Documents\Financas\BD\Fundos\cad_fi.csv"
info_cad = pd.read_csv(cad_fi_csv, sep = ';', encoding='latin1',
dtype = {'RENTAB_FUNDO': object,'FUNDO_EXCLUSIVO': object, 'TRIB_LPRAZO': object, 'ENTID_INVEST': object,
'INF_TAXA_PERFM': object, 'INF_TAXA_ADM': object, 'DIRETOR': object, 'CNPJ_CONTROLADOR': object,
'CONTROLADOR': object}
Expand Down
4 changes: 4 additions & 0 deletions fundspy/start_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#starts a database

from fundspy import cvm_informes, start_db
start_db(db_dir = 'investments_database_2017.db', start_year = 2017, target_funds = [])
3 changes: 3 additions & 0 deletions fundspy/update_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#updates a database
from fundspy import cvm_informes, update_db
update_db(db_dir = r'../investments_database_2017.db')