Skip to content
This repository has been archived by the owner on Feb 28, 2023. It is now read-only.

Commit

Permalink
Closes #6, closes #19
Browse files Browse the repository at this point in the history
  • Loading branch information
Mincka committed Jul 9, 2017
1 parent 0c365d4 commit e5ef8e1
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 83 deletions.
22 changes: 12 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ I have made this tool to retrieve all the tweets from my private conversations a
[2016-09-07 10:38:10] <Steve> You guys are ridiculous! 😂
```

This tool is also able to **download all the uploaded images** in their original resolution and, as a bonus, also retrieve the **GIFs** you used in your conversations as MP4 files (the format used by Twitter to optimize them and save space).
This tool is also able to **download all the uploaded images and videos** in their original resolution and, as a bonus, also retrieve the **GIFs** you used in your conversations as MP4 files (the format used by Twitter to optimize them and save space).

You may have found suggestions to use Twitter's archive feature to do the same, but Direct Messages are not included in the generated archive.

Expand Down Expand Up @@ -76,31 +76,33 @@ $ pip3 install dmarchiver --upgrade

### Command line tool
```
$ dmarchiver [-h] [-id CONVERSATION_ID] [-di] [-dg]
$ dmarchiver [-h] [-id CONVERSATION_ID] [-u] [-p] [-di] [-dg] [-dv]
$ dmarchiver --help
usage: cmdline.py [-h] [-id CONVERSATION_ID] [-di] [-dg]
usage: cmdline.py [-h] [-id CONVERSATION_ID] [-u] [-p] [-di] [-dg] [-dv]
optional arguments:
-h, --help show this help message and exit
-id CONVERSATION_ID, --conversation_id CONVERSATION_ID
Conversation ID
-u, --username Username (e-mail or handle)
-p, --password Password
-u, --username Username (e-mail or handle)
-p, --password Password
-di, --download-images
Download images
-dg, --download-gifs Download GIFs (as MP4)
-r, --raw-output Write the raw HTML to a file
-dv, --download-videos
Download videos (as MP4)
-r, --raw-output Write the raw HTML to a file
```

### Examples

#### Archive all conversations with images:
`$ dmarchiver -di`
#### Archive all conversations with images and videos:
`$ dmarchiver -di -dv`

The script output will be the `645754097571131337.txt` file with the conversation formatted in an _IRC-like_ style.

The images and GIFs files can be respectively found in the `645754097571131337/images` and `645754097571131337/mp4` folders.
The image and video files can be found in the `645754097571131337/images` and `645754097571131337/mp4-*` folders, respectively.

#### Archive a specific conversation:
To retrieve only one conversation with the ID `645754097571131337`:
Expand Down Expand Up @@ -133,7 +135,7 @@ for (var i = 0; i < conversations.length; i++) {
#### Schedule a task to perform incremental backups of a conversation
You can also specify the username and the password in the options. Because DMArchiver is able to perform incremental updates, you can schedule a task or create a shortcut with the following arguments:

`$ dmarchiver -id "conversation_id" -di -dg -u your_username -p your_password`
`$ dmarchiver -id "conversation_id" -di -dg -dv -u your_username -p your_password`

### Module import
```python
Expand Down
2 changes: 1 addition & 1 deletion dmarchiver/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
without having to deal with the API limitations.
"""

__version__ = "0.1.6"
__version__ = "0.1.7"
50 changes: 31 additions & 19 deletions dmarchiver/cmdline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Direct Messages Archiver - Command Line
Usage:
# dmarchiver [-h] [-id CONVERSATION_ID] [-di] [-dg]
# dmarchiver [-h] [-id CONVERSATION_ID] [-u] [-p] [-di] [-dg] [-dv]
optional arguments:
-h, --help show this help message and exit
Expand All @@ -15,6 +15,8 @@
-di, --download-images
Download images
-dg, --download-gifs Download GIFs (as MP4)
-dv, --download-videos
Download videos (as MP4)
-r, --raw-output Write the raw HTML to a file
"""

Expand Down Expand Up @@ -44,6 +46,11 @@ def main():
"--download-gifs",
help="Download GIFs (as MP4)",
action="store_true")
parser.add_argument(
"-dv",
"--download-videos",
help="Download videos (as MP4)",
action="store_true")
parser.add_argument(
"-r",
"--raw-output",
Expand All @@ -70,26 +77,31 @@ def main():
print('Error: {0}'.format(err.args[0]))
print('Exiting.')
sys.exit()

print('Press Ctrl+C at anytime to write the current conversation and skip to the next one.\n Keep it pressed to exit the script.\n')

if args.conversation_id is not None:
# Prevent error when using '' instead of ""
conversation_id = args.conversation_id.strip('\'')
print(
'Conversation ID specified ({0}). Retrieving only one thread.'.format(
args.conversation_id))
crawler.crawl(
conversation_id,
args.download_images,
args.download_gifs, args.raw_output)
else:
print('Conversation ID not specified. Retrieving all the threads.')
threads = crawler.get_threads()
print('{0} thread(s) found.'.format(len(threads)))

for thread_id in threads:
crawler.crawl(thread_id, args.download_images,
args.download_gifs, args.raw_output)
try:
    # Dispatch: crawl a single conversation when an ID was given on the
    # command line, otherwise enumerate every thread and crawl each one.
    if args.conversation_id is not None:
        # Prevent error when using '' instead of ""
        conversation_id = args.conversation_id.strip('\'')
        print(
            'Conversation ID specified ({0}). Retrieving only one thread.'.format(
                args.conversation_id))
        # The flags are passed by keyword so that each one reaches the
        # matching crawl() parameter. The previous positional call omitted
        # download_videos, which silently bound --raw-output to it and
        # ignored --download-videos for single-conversation runs.
        crawler.crawl(
            conversation_id,
            download_images=args.download_images,
            download_gifs=args.download_gifs,
            download_videos=args.download_videos,
            raw_output=args.raw_output)
    else:
        print('Conversation ID not specified. Retrieving all the threads.')
        threads = crawler.get_threads()
        print('{0} thread(s) found.'.format(len(threads)))

        for thread_id in threads:
            crawler.crawl(
                thread_id,
                download_images=args.download_images,
                download_gifs=args.download_gifs,
                download_videos=args.download_videos,
                raw_output=args.raw_output)
except KeyboardInterrupt:
    # Reached when Ctrl+C is received outside of crawl()'s own handler.
    print('Script execution interruption requested. Exiting.')
    sys.exit()

if __name__ == "__main__":
main()
102 changes: 63 additions & 39 deletions dmarchiver/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,8 @@ def _parse_dm_media(
tweet_id,
time_stamp,
download_images,
download_gif):
download_gifs,
download_videos):
media_url = ''
media_preview_url = ''
media_alt = ''
Expand Down Expand Up @@ -438,19 +439,32 @@ def _parse_dm_media(
media_filename = '{0}-{1}-{2}'.format(formatted_timestamp, media_filename_re[0][
0], media_filename_re[0][1])

if download_gif:
if download_gifs:
response = self._session.get(media_url, stream=True)
if response.status_code == 200:
os.makedirs(
'{0}/mp4'.format(self._conversation_id), exist_ok=True)
with open('{0}/mp4/{1}'.format(self._conversation_id, media_filename), 'wb') as file:
'{0}/mp4-gifs'.format(self._conversation_id), exist_ok=True)
with open('{0}/mp4-gifs/{1}'.format(self._conversation_id, media_filename), 'wb') as file:
response.raw.decode_content = True
shutil.copyfileobj(response.raw, file)
elif len(video_url) > 0:
media_type = MediaType.video
media_style = video_url[0].find('div').get('style')
media_preview_url = re.findall('url\(\'(.*?)\'\)', media_style)[0]
media_url = 'https://twitter.com/i/videos/dm/' + tweet_id
video_url = 'https://mobile.twitter.com/messages/media/' + tweet_id
media_filename = '{0}-{1}.mp4'.format(
formatted_timestamp, tweet_id)

if download_videos:
response = self._session.get(video_url, stream=True)
if response.status_code == 200:
os.makedirs(
'{0}/mp4-videos'.format(self._conversation_id), exist_ok=True)
with open('{0}/mp4-videos/{1}'.format(self._conversation_id, media_filename), 'wb') as file:
response.raw.decode_content = True
shutil.copyfileobj(response.raw, file)

else:
print('Unknown media')

Expand All @@ -471,7 +485,7 @@ def _parse_dm_card(self, element):
card.get('data-card-url'),
card.get('data-card-name'))

def _process_tweets(self, tweets, download_images, download_gif, max_id):
def _process_tweets(self, tweets, download_images, download_gifs, download_videos, max_id):
conversation_set = collections.OrderedDict()
ordered_tweets = sorted(tweets, reverse=True)

Expand Down Expand Up @@ -534,7 +548,7 @@ def _process_tweets(self, tweets, download_images, download_gif, max_id):
message.elements.append(element_object)
elif 'DirectMessage-media' in dm_element_type:
element_object = self._parse_dm_media(
dm_element, tweet_id, time_stamp, download_images, download_gif)
dm_element, tweet_id, time_stamp, download_images, download_gifs, download_videos)
message.elements.append(element_object)
elif 'DirectMessage-tweet' in dm_element_type:
element_object = self._parse_dm_tweet(dm_element)
Expand All @@ -548,6 +562,11 @@ def _process_tweets(self, tweets, download_images, download_gif, max_id):
elif len(dm_conversation_entry) > 0:
dm_element_text = dm_conversation_entry[0].text.strip()
message = DMConversationEntry(tweet_id, dm_element_text)
except KeyboardInterrupt:
print(
'Script execution interruption requested. Writing the conversation.')
self._max_id_found = True
break
except:
print(
'Unexpected error for tweet \'{0}\', raw HTML will be used for the tweet.'.format(tweet_id))
Expand All @@ -564,7 +583,8 @@ def crawl(
self,
conversation_id,
download_images=False,
download_gif=False,
download_gifs=False,
download_videos=False,
raw_output=False):

raw_output_file = None
Expand All @@ -585,38 +605,42 @@ def crawl(
payload = {'id': conversation_id}
processed_tweet_counter = 0

while True and self._max_id_found == False:
response = self._session.get(
conversation_url,
headers=self._ajax_headers,
params=payload)

json = response.json()

if 'max_entry_id' not in json:
print('Begin of thread reached')
break

payload = {'id': conversation_id,
'max_entry_id': json['min_entry_id']}

tweets = json['items']

if raw_output:
ordered_tweets = sorted(tweets, reverse=True)
for tweet_id in ordered_tweets:
raw_output_file.write(tweets[tweet_id].encode('UTF-8'))

# Get tweets for the current request
conversation_set = self._process_tweets(
tweets, download_images, download_gif, max_id)

# Append to the whole conversation
for tweet_id in conversation_set:
processed_tweet_counter += 1
conversation.tweets[tweet_id] = conversation_set[tweet_id]
print('Processed tweets: {0}\r'.format(
processed_tweet_counter), end='')
try:
while True and self._max_id_found == False:
response = self._session.get(
conversation_url,
headers=self._ajax_headers,
params=payload)

json = response.json()

if 'max_entry_id' not in json:
print('Begin of thread reached')
break

payload = {'id': conversation_id,
'max_entry_id': json['min_entry_id']}

tweets = json['items']

if raw_output:
ordered_tweets = sorted(tweets, reverse=True)
for tweet_id in ordered_tweets:
raw_output_file.write(tweets[tweet_id].encode('UTF-8'))

# Get tweets for the current request
conversation_set = self._process_tweets(
tweets, download_images, download_gifs, download_videos, max_id)

# Append to the whole conversation
for tweet_id in conversation_set:
processed_tweet_counter += 1
conversation.tweets[tweet_id] = conversation_set[tweet_id]
print('Processed tweets: {0}\r'.format(
processed_tweet_counter), end='')
except KeyboardInterrupt:
print(
'Script execution interruption requested. Writing this conversation.')

if raw_output:
raw_output_file.close()
Expand Down
28 changes: 14 additions & 14 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
from setuptools import setup, find_packages
import codecs
from setuptools import setup, find_packages
import dmarchiver

setup(

name='dmarchiver',
version=dmarchiver.__version__,

packages=find_packages(),

install_requires=['requests==2.11.1', 'lxml==3.6.4', 'cssselect==0.9.2'],
install_requires=['requests==2.11.1', 'lxml==3.6.4', 'cssselect==0.9.2'],

author="Julien EHRHART",
author_email="[email protected]",

description="A tool to archive the direct messages from your private conversations on Twitter.",
long_description=open('README.md').read(),

long_description=codecs.open('README.md', 'r', 'utf-8').read(),

include_package_data=True,

url='https://github.com/Mincka/DMArchiver',

classifiers=[
"Programming Language :: Python",
"Development Status :: 3 - Alpha",
Expand All @@ -32,12 +32,12 @@
"Topic :: System :: Archiving",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
],
entry_points = {

entry_points={
'console_scripts': [
'dmarchiver = dmarchiver.cmdline:main',
],
},

license="GNU General Public License v3 (GPLv3)",
)

0 comments on commit e5ef8e1

Please sign in to comment.