Closes #6, closes #19

Mincka · Jul 9, 2017 · e5ef8e1 · e5ef8e1
1 parent 0c365d4
commit e5ef8e1
Show file tree

Hide file tree

Showing 5 changed files with 121 additions and 83 deletions.
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ I have made this tool to retrieve all the tweets from my private conversations a
 [2016-09-07 10:38:10] <Steve> You guys are ridiculous! 😂
 ```
 
-This tool is also able to **download all the uploaded images** in their original resolution and, as a bonus, also retrieve the **GIFs** you used in your conversations as MP4 files (the format used by Twitter to optimize them and save space).
+This tool is also able to **download all the uploaded images and videos** in their original resolution and, as a bonus, also retrieve the **GIFs** you used in your conversations as MP4 files (the format used by Twitter to optimize them and save space).
 
 You may have found suggestions to use the Twitter's archive feature to do the same but Direct Messages are not included in the generated archive.
 
@@ -76,31 +76,33 @@ $ pip3 install dmarchiver --upgrade
 
 ### Command line tool
 ```
-$ dmarchiver [-h] [-id CONVERSATION_ID] [-di] [-dg]
+$ dmarchiver [-h] [-id CONVERSATION_ID] [-u] [-p] [-di] [-dg] [-dv]
 
 $ dmarchiver --help
- usage: cmdline.py [-h] [-id CONVERSATION_ID] [-di] [-dg]
+ usage: cmdline.py [-h] [-id CONVERSATION_ID] [-u] [-p] [-di] [-dg] [-dv]
  
  optional arguments:
  -h, --help show this help message and exit
  -id CONVERSATION_ID, --conversation_id CONVERSATION_ID
  Conversation ID
-  -u, --username Username (e-mail or handle)
-  -p, --password Password
+  -u, --username Username (e-mail or handle)
+  -p, --password Password
  -di, --download-images
  Download images
  -dg, --download-gifs Download GIFs (as MP4)
- -r, --raw-output Write the raw HTML to a file
+ -dv, --download-videos
+ Download videos (as MP4)
+ -r, --raw-output Write the raw HTML to a file
 ```
 
 ### Examples
 
-#### Archive all conversations with images:
-`$ dmarchiver -di`
+#### Archive all conversations with images and videos:
+`$ dmarchiver -di -dv`
 
 The script output will be the `645754097571131337.txt` file with the conversation formatted in an _IRC-like_ style.
 
-The images and GIFs files can be respectively found in the `645754097571131337/images` and `645754097571131337/mp4` folders.
+The image files can be found in the `645754097571131337/images` folder, and the GIF and video files (saved as MP4) in the `645754097571131337/mp4-*` folders (`mp4-gifs` and `mp4-videos`, respectively).
 
 #### Archive a specific conversation:
 To retrieve only one conversation with the ID `645754097571131337`:
@@ -133,7 +135,7 @@ for (var i = 0; i < conversations.length; i++) {
 #### Schedule a task to perform incremental backups of a conversation
 You can also specify the username and the password in the options. Because DMArchiver is able to perform incremental updates, you can schedule a task or create a shortcut with the following arguments:
 
-`$ dmarchiver -id "conversation_id" -di -dg -u your_username -p your_password`
+`$ dmarchiver -id "conversation_id" -di -dg -dv -u your_username -p your_password`
 
 ### Module import
 ```python

diff --git a/dmarchiver/__init__.py b/dmarchiver/__init__.py
@@ -5,4 +5,4 @@
  without having to deal with the API limitations.
 """
 
-__version__ = "0.1.6"
+__version__ = "0.1.7"
diff --git a/dmarchiver/cmdline.py b/dmarchiver/cmdline.py
@@ -4,7 +4,7 @@
  Direct Messages Archiver - Command Line
 
  Usage:
- # dmarchiver [-h] [-id CONVERSATION_ID] [-di] [-dg]
+ # dmarchiver [-h] [-id CONVERSATION_ID] [-u] [-p] [-di] [-dg] [-dv]
 
  optional arguments:
  -h, --help show this help message and exit
@@ -15,6 +15,8 @@
  -di, --download-images
  Download images
  -dg, --download-gifs Download GIFs (as MP4)
+ -dv, --download-videos
+ Download videos (as MP4)
  -r, --raw-output Write the raw HTML to a file
 """
 
@@ -44,6 +46,11 @@ def main():
  "--download-gifs",
  help="Download GIFs (as MP4)",
  action="store_true")
+ parser.add_argument(
+ "-dv",
+ "--download-videos",
+ help="Download videos (as MP4)",
+ action="store_true")
  parser.add_argument(
  "-r",
  "--raw-output",
@@ -70,26 +77,31 @@ def main():
  print('Error: {0}'.format(err.args[0]))
  print('Exiting.')
  sys.exit()
+
+ print('Press Ctrl+C at anytime to write the current conversation and skip to the next one.\n Keep it pressed to exit the script.\n')
 
- if args.conversation_id is not None:
- # Prevent error when using '' instead of ""
- conversation_id = args.conversation_id.strip('\'')
- print(
- 'Conversation ID specified ({0}). Retrieving only one thread.'.format(
- args.conversation_id))
- crawler.crawl(
- conversation_id,
- args.download_images,
- args.download_gifs, args.raw_output)
- else:
- print('Conversation ID not specified. Retrieving all the threads.')
- threads = crawler.get_threads()
- print('{0} thread(s) found.'.format(len(threads)))
-
- for thread_id in threads:
- crawler.crawl(thread_id, args.download_images,
- args.download_gifs, args.raw_output)
+ try:
+ if args.conversation_id is not None:
+ # Prevent error when using '' instead of ""
+ conversation_id = args.conversation_id.strip('\'')
+ print(
+ 'Conversation ID specified ({0}). Retrieving only one thread.'.format(
+ args.conversation_id))
+ crawler.crawl(
+ conversation_id,
+ args.download_images,
+ args.download_gifs, args.raw_output)
+ else:
+ print('Conversation ID not specified. Retrieving all the threads.')
+ threads = crawler.get_threads()
+ print('{0} thread(s) found.'.format(len(threads)))
 
+ for thread_id in threads:
+ crawler.crawl(thread_id, args.download_images,
+ args.download_gifs, args.download_videos, args.raw_output)
+ except KeyboardInterrupt:
+ print('Script execution interruption requested. Exiting.')
+ sys.exit()
 
 if __name__ == "__main__":
  main()
diff --git a/dmarchiver/core.py b/dmarchiver/core.py
@@ -389,7 +389,8 @@ def _parse_dm_media(
  tweet_id,
  time_stamp,
  download_images,
- download_gif):
+ download_gifs,
+ download_videos):
  media_url = ''
  media_preview_url = ''
  media_alt = ''
@@ -438,19 +439,32 @@ def _parse_dm_media(
  media_filename = '{0}-{1}-{2}'.format(formatted_timestamp, media_filename_re[0][
  0], media_filename_re[0][1])
 
- if download_gif:
+ if download_gifs:
  response = self._session.get(media_url, stream=True)
  if response.status_code == 200:
  os.makedirs(
- '{0}/mp4'.format(self._conversation_id), exist_ok=True)
- with open('{0}/mp4/{1}'.format(self._conversation_id, media_filename), 'wb') as file:
+ '{0}/mp4-gifs'.format(self._conversation_id), exist_ok=True)
+ with open('{0}/mp4-gifs/{1}'.format(self._conversation_id, media_filename), 'wb') as file:
  response.raw.decode_content = True
  shutil.copyfileobj(response.raw, file)
  elif len(video_url) > 0:
  media_type = MediaType.video
  media_style = video_url[0].find('div').get('style')
  media_preview_url = re.findall('url\(\'(.*?)\'\)', media_style)[0]
  media_url = 'https://twitter.com/i/videos/dm/' + tweet_id
+ video_url = 'https://mobile.twitter.com/messages/media/' + tweet_id
+ media_filename = '{0}-{1}.mp4'.format(
+ formatted_timestamp, tweet_id)
+
+ if download_videos:
+ response = self._session.get(video_url, stream=True)
+ if response.status_code == 200:
+ os.makedirs(
+ '{0}/mp4-videos'.format(self._conversation_id), exist_ok=True)
+ with open('{0}/mp4-videos/{1}'.format(self._conversation_id, media_filename), 'wb') as file:
+ response.raw.decode_content = True
+ shutil.copyfileobj(response.raw, file)
+
  else:
  print('Unknown media')
 
@@ -471,7 +485,7 @@ def _parse_dm_card(self, element):
  card.get('data-card-url'),
  card.get('data-card-name'))
 
- def _process_tweets(self, tweets, download_images, download_gif, max_id):
+ def _process_tweets(self, tweets, download_images, download_gifs, download_videos, max_id):
  conversation_set = collections.OrderedDict()
  ordered_tweets = sorted(tweets, reverse=True)
 
@@ -534,7 +548,7 @@ def _process_tweets(self, tweets, download_images, download_gif, max_id):
  message.elements.append(element_object)
  elif 'DirectMessage-media' in dm_element_type:
  element_object = self._parse_dm_media(
- dm_element, tweet_id, time_stamp, download_images, download_gif)
+ dm_element, tweet_id, time_stamp, download_images, download_gifs, download_videos)
  message.elements.append(element_object)
  elif 'DirectMessage-tweet' in dm_element_type:
  element_object = self._parse_dm_tweet(dm_element)
@@ -548,6 +562,11 @@ def _process_tweets(self, tweets, download_images, download_gif, max_id):
  elif len(dm_conversation_entry) > 0:
  dm_element_text = dm_conversation_entry[0].text.strip()
  message = DMConversationEntry(tweet_id, dm_element_text)
+ except KeyboardInterrupt:
+ print(
+ 'Script execution interruption requested. Writing the conversation.')
+ self._max_id_found = True
+ break
  except:
  print(
  'Unexpected error for tweet \'{0}\', raw HTML will be used for the tweet.'.format(tweet_id))
@@ -564,7 +583,8 @@ def crawl(
  self,
  conversation_id,
  download_images=False,
- download_gif=False,
+ download_gifs=False,
+ download_videos=False,
  raw_output=False):
 
  raw_output_file = None
@@ -585,38 +605,42 @@ def crawl(
  payload = {'id': conversation_id}
  processed_tweet_counter = 0
 
- while True and self._max_id_found == False:
- response = self._session.get(
- conversation_url,
- headers=self._ajax_headers,
- params=payload)
-
- json = response.json()
-
- if 'max_entry_id' not in json:
- print('Begin of thread reached')
- break
-
- payload = {'id': conversation_id,
- 'max_entry_id': json['min_entry_id']}
-
- tweets = json['items']
-
- if raw_output:
- ordered_tweets = sorted(tweets, reverse=True)
- for tweet_id in ordered_tweets:
- raw_output_file.write(tweets[tweet_id].encode('UTF-8'))
-
- # Get tweets for the current request
- conversation_set = self._process_tweets(
- tweets, download_images, download_gif, max_id)
-
- # Append to the whole conversation
- for tweet_id in conversation_set:
- processed_tweet_counter += 1
- conversation.tweets[tweet_id] = conversation_set[tweet_id]
- print('Processed tweets: {0}\r'.format(
- processed_tweet_counter), end='')
+ try:
+ while True and self._max_id_found == False:
+ response = self._session.get(
+ conversation_url,
+ headers=self._ajax_headers,
+ params=payload)
+
+ json = response.json()
+
+ if 'max_entry_id' not in json:
+ print('Begin of thread reached')
+ break
+
+ payload = {'id': conversation_id,
+ 'max_entry_id': json['min_entry_id']}
+
+ tweets = json['items']
+
+ if raw_output:
+ ordered_tweets = sorted(tweets, reverse=True)
+ for tweet_id in ordered_tweets:
+ raw_output_file.write(tweets[tweet_id].encode('UTF-8'))
+
+ # Get tweets for the current request
+ conversation_set = self._process_tweets(
+ tweets, download_images, download_gifs, download_videos, max_id)
+
+ # Append to the whole conversation
+ for tweet_id in conversation_set:
+ processed_tweet_counter += 1
+ conversation.tweets[tweet_id] = conversation_set[tweet_id]
+ print('Processed tweets: {0}\r'.format(
+ processed_tweet_counter), end='')
+ except KeyboardInterrupt:
+ print(
+ 'Script execution interruption requested. Writing this conversation.')
 
  if raw_output:
  raw_output_file.close()

diff --git a/setup.py b/setup.py
@@ -1,27 +1,27 @@
-from setuptools import setup, find_packages
- 
+import codecs
+from setuptools import setup, find_packages
 import dmarchiver
 
 setup(
- 
+
  name='dmarchiver',
  version=dmarchiver.__version__,
- 
+
  packages=find_packages(),
 
- install_requires=['requests==2.11.1', 'lxml==3.6.4', 'cssselect==0.9.2'], 
+ install_requires=['requests==2.11.1', 'lxml==3.6.4', 'cssselect==0.9.2'],
 
  author="Julien EHRHART",
  author_email="[email protected]",
- 
+
  description="A tool to archive the direct messages from your private conversations on Twitter.",
- 
- long_description=open('README.md').read(),
- 
+
+ long_description=codecs.open('README.md', 'r', 'utf-8').read(),
+
  include_package_data=True,
- 
+
  url='https://github.com/Mincka/DMArchiver',
- 
+
  classifiers=[
  "Programming Language :: Python",
  "Development Status :: 3 - Alpha",
@@ -32,12 +32,12 @@
  "Topic :: System :: Archiving",
  "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
  ],
- 
- entry_points = {
+
+ entry_points={
  'console_scripts': [
  'dmarchiver = dmarchiver.cmdline:main',
  ],
  },
- 
+
  license="GNU General Public License v3 (GPLv3)",
 )