处理字符串中嵌入的换行符

我正在处理以jsonl形式获取的twitter数据。我已经将其转换为json,并尝试将其转换为csv(以便导入到只接受csv或MySQL的程序中)。但是,有些人在他们的推文或个人资料中加入了硬换行符。这导致csv文件中出现跨多行的记录,通常是在推文中间断开。我已经尝试了一些在github上流传的python json转csv代码。

我最近的一次尝试:

jq -s "." tiny00subset.jsonl > tiny00subset.json
json2csv -i tiny00subset.json -o tiny00subset.csv

部分示例推文(json格式):

  {
"created_at": "Mon Aug 13 10:40:34 +0000 2018","id": 1028954459110555600,"id_str": "1028954459110555649","full_text": "Oh well,they deal with it quite well. Like they add numbers and facts and such crazy stuff.\nhttps://REPLACED/DuBGmHCnG8\n#climatechange https://REPLACED/d5IBchM3Uk","truncated": false,"display_text_range": [
  0,131
],"entities": {
  "hashtags": [
    {
      "text": "climatechange","indices": [
        117,131
      ]
    }
  ],"symbols": [],"user_mentions": [],"urls": [
    {
      "url": "https://REPLACED/DuBGmHCnG8","expanded_url": "https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/","display_url": "tamino.wordpress.com/2018/08/08/usa…","indices": [
        93,116
      ]
    },{
      "url": "https://REPLACED/d5IBchM3Uk","expanded_url": "https://twitter.com/Tony__Heller/status/1028672939753758720","display_url": "twitter.com/Tony__Heller/s…","indices": [
        132,155
      ]
    }
  ]
}

}

CSV输出:

    "Mon Aug 13 10:40:34 +0000 2018",1028954459110555600,"1028954459110555649","Oh well,they deal with it quite well. Like they add numbers and facts and such crazy stuff.
    https://REPLACED/DuBGmHCnG8
    #climatechange https://REPLACED/d5IBchM3Uk",false,"[0,131]","{""hashtags"":[{""text"":""climatechange"",""indices"":[117,131]}],""symbols"":[],""user_mentions"":[],""urls"":[{""url"":""https://REPLACED/DuBGmHCnG8"",""expanded_url"":""https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/"",""display_url"":""tamino.wordpress.com/2018/08/08/usa…"",""indices"":[93,116]},{""url"":""https://REPLACED/d5IBchM3Uk"",""expanded_url"":""https://twitter.com/Tony__Heller/status/1028672939753758720"",""display_url"":""twitter.com/Tony__Heller/s…"",""indices"":[132,155]}]}","<a href=""https://about.twitter.com/products/tweetdeck"" rel=""nofollow"">TweetDeck</a>","{""id"":59806323,""id_str"":""59806323"",""name"":""Daniel"",""screen_name"":""sleeksorrow"",""location"":""Karlsruhe,Germany"",""description"":""Politik,IT,Blödsinn und deren Schnittmenge. Ebenfalls: Hochmittelalter Darstellung,Falknerei,Greifvogelschutz - profile picture by @herrkausk"",""url"":""https://REPLACED/E8aNHIhCtg"",""entities"":{""url"":{""urls"":[{""url"":""https://REPLACED/E8aNHIhCtg"",""expanded_url"":""http://sleeksorrow.blogspot.com/"",""display_url"":""sleeksorrow.blogspot.com"",""indices"":[0,23]}]},""description"":{""urls"":[]}},""protected"":false,""followers_count"":572,""friends_count"":392,""listed_count"":47,""created_at"":""Fri Jul 24 15:15:25 +0000 
2009"",""favourites_count"":13259,""utc_offset"":null,""time_zone"":null,""geo_enabled"":false,""verified"":false,""statuses_count"":48861,""lang"":null,""contributors_enabled"":false,""is_translator"":false,""is_translation_enabled"":false,""profile_background_color"":""1A1B1F"",""profile_background_image_url"":""http://abs.twimg.com/images/themes/theme9/bg.gif"",""profile_background_image_url_https"":""https://abs.twimg.com/images/themes/theme9/bg.gif"",""profile_background_tile"":false,""profile_image_url"":""http://pbs.twimg.com/profile_images/877219681513480192/1rj4xqpK_normal.jpg"",""profile_image_url_https"":""https://pbs.twimg.com/profile_images/877219681513480192/1rj4xqpK_normal.jpg"",""profile_banner_url"":""https://pbs.twimg.com/profile_banners/59806323/1397029131"",""profile_image_extensions_alt_text"":null,""profile_banner_extensions_alt_text"":null,""profile_link_color"":""2FC2EF"",""profile_sidebar_border_color"":""181A1E"",""profile_sidebar_fill_color"":""252429"",""profile_text_color"":""666666"",""profile_use_background_image"":true,""has_extended_profile"":false,""default_profile"":false,""default_profile_image"":false,""can_media_tag"":true,""followed_by"":false,""following"":false,""follow_request_sent"":false,""notifications"":false,""translator_type"":""none""}",true,1028672939753758700,"1028672939753758720","{""url"":""https://REPLACED/d5IBchM3Uk"",""expanded"":""https://twitter.com/Tony__Heller/status/1028672939753758720"",""display"":""twitter.com/Tony__Heller/s…""}","{""created_at"":""Sun Aug 12 16:01:55 +0000 2018"",""id"":1028672939753758700,""id_str"":""1028672939753758720"",""full_text"":""@DeanFieldingF1 It is very difficult or impossible for climate alarmists to deal with reality. 
https://REPLACED/wOJTptxIqH"",""truncated"":false,""display_text_range"":[16,94],""entities"":{""hashtags"":[],""user_mentions"":[{""screen_name"":""DeanFieldingF1"",""name"":""Dean Fielding"",""id"":797295219825897500,""id_str"":""797295219825897472"",15]}],""urls"":[],""media"":[{""id"":1028672868849090600,""id_str"":""1028672868849090560"",""indices"":[95,118],""media_url"":""http://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""url"":""https://REPLACED/wOJTptxIqH"",""display_url"":""pic.twitter.com/wOJTptxIqH"",""expanded_url"":""https://twitter.com/SteveSGoddard/status/1028672939753758720/photo/1"",""type"":""photo"",""sizes"":{""thumb"":{""w"":150,""h"":150,""resize"":""crop""},""medium"":{""w"":1070,""h"":983,""resize"":""fit""},""large"":{""w"":1070,""small"":{""w"":680,""h"":625,""resize"":""fit""}},""features"":{""orig"":{""faces"":[]},""medium"":{""faces"":[]},""large"":{""faces"":[]},""small"":{""faces"":[]}}}]},""extended_entities"":{""media"":[{""id"":1028672868849090600,""small"":{""faces"":[]}},""ext_alt_text"":null},{""id"":1028672883986333700,""id_str"":""1028672883986333697"",""media_url"":""http://pbs.twimg.com/media/DkaUibAVAAEaQt0.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUibAVAAEaQt0.jpg"",""ext_alt_text"":null}]},""source"":""<a href=\""http://twitter.com\"" rel=\""nofollow\"">Twitter Web Client</a>"",""in_reply_to_status_id"":1028671170802081800,""in_reply_to_status_id_str"":""1028671170802081793"",""in_reply_to_user_id"":797295219825897500,""in_reply_to_user_id_str"":""797295219825897472"",""in_reply_to_screen_name"":""DeanFieldingF1"",""user"":{""id"":435704007,""id_str"":""435704007"",""name"":""Tony 
Heller"",""screen_name"":""Tony__Heller"",""location"":""Colorado"",""description"":""https://REPLACED/j5CaDNyIqE"",""url"":""https://REPLACED/Pyn117xXna"",""entities"":{""url"":{""urls"":[{""url"":""https://REPLACED/Pyn117xXna"",""expanded_url"":""http://realclimatescience.com"",""display_url"":""realclimatescience.com"",""description"":{""urls"":[{""url"":""https://REPLACED/j5CaDNyIqE"",""expanded_url"":""https://realclimatescience.com/who-is-tony-heller/"",""display_url"":""realclimatescience.com/who-is-tony-he…"",23]}]}},""followers_count"":44955,""friends_count"":374,""listed_count"":886,""created_at"":""Tue Dec 13 10:44:34 +0000 2011"",""favourites_count"":3740,""geo_enabled"":true,""statuses_count"":165165,""profile_background_color"":""185370"",""profile_background_image_url"":""http://abs.twimg.com/images/themes/theme1/bg.png"",""profile_background_image_url_https"":""https://abs.twimg.com/images/themes/theme1/bg.png"",""profile_image_url"":""http://pbs.twimg.com/profile_images/1175541923508916225/0qEi4yIj_normal.jpg"",""profile_image_url_https"":""https://pbs.twimg.com/profile_images/1175541923508916225/0qEi4yIj_normal.jpg"",""profile_banner_url"":""https://pbs.twimg.com/profile_banners/435704007/1469798959"",""profile_link_color"":""0084B4"",""profile_sidebar_border_color"":""FFFFFF"",""profile_sidebar_fill_color"":""DDEEF6"",""profile_text_color"":""333333"",""can_media_tag"":false,""translator_type"":""none""},""geo"":null,""coordinates"":null,""place"":null,""contributors"":null,""is_quote_status"":false,""retweet_count"":16,""favorite_count"":27,""favorited"":false,""retweeted"":false,""possibly_sensitive"":false,""lang"":""en""}","en"
ysjln 回答:处理字符串中嵌入的换行符

开始
awk -F: '{if($3>1000) print $1}' passwd | sort > users.txt

并运行(它是https://github.com/johnkerl/miller)

awk -F: '$3>1000 {print $1}' passwd | sort > users.txt

您会得到这样的输出:https://gist.github.com/aborruso/6e0361923a3c45b9fe55ebf7590953de#file-output-csv

如果以raw格式打开它,您会看到字段中的回车符(换行符)。不过,电子表格程序可以正确读取它。

然后,正确使用您需要使用的导入过程,{ "created_at": "Mon Aug 13 10:40:34 +0000 2018","id": 1028954459110555600,"id_str": "1028954459110555649","full_text": "Oh well,they deal with it quite well. Like they add numbers and facts and such crazy stuff.\nhttps://REPLACED/DuBGmHCnG8\n#climatechange https://REPLACED/d5IBchM3Uk","truncated": false,"display_text_range": [ 0,131 ],"entities": { "hashtags": [ { "text": "climatechange","indices": [ 117,131 ] } ],"symbols": [],"user_mentions": [],"urls": [ { "url": "https://REPLACED/DuBGmHCnG8","expanded_url": "https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/","display_url": "tamino.wordpress.com/2018/08/08/usa…","indices": [ 93,116 ] },{ "url": "https://REPLACED/d5IBchM3Uk","expanded_url": "https://twitter.com/Tony__Heller/status/1028672939753758720","display_url": "twitter.com/Tony__Heller/s…","indices": [ 132,155 ] } ] } } 没问题。

enter image description here

本文链接:https://www.f2er.com/3082798.html

大家都在问